From 69d5f78b1682fc11c2f19a2e271e663490a2eed3 Mon Sep 17 00:00:00 2001
From: JD <Jehandad.Khan@amd.com>
Date: Thu, 12 May 2022 09:21:01 -0500
Subject: [PATCH] Add host API (#220)

* Add host API

* manually rebase on develop

* clean

* manually rebase on develop

* exclude tests from all target

* address review comments

* update client app name

* fix missing lib name

* clang-format update

* refactor

* refactor

* refactor

* refactor

* refactor

* fix test issue

* refactor

* refactor

* refactor

* upate cmake and readme

Co-authored-by: Chao Liu <chao.liu2@amd.com>

[ROCm/composable_kernel commit: cec69bc3bc200de7e09396579fe33cb153f8afeb]
---
 CMakeLists.txt                                |  30 ++-
 Config.cmake.in                               |  11 +
 Dockerfile                                    |  15 +-
 Jenkinsfile                                   |  19 +-
 README.md                                     |  10 +
 cmake/googletest.cmake                        |   3 +-
 example/01_gemm/gemm_xdl_bf16.cpp             |  14 +-
 example/01_gemm/gemm_xdl_fp16.cpp             |  14 +-
 example/01_gemm/gemm_xdl_int8.cpp             |  14 +-
 .../gemm_xdl_alpha_beta.cpp                   |  16 +-
 .../03_gemm_bias_relu/gemm_xdl_bias_relu.cpp  |  14 +-
 .../gemm_xdl_bias_relu_add.cpp                |  14 +-
 .../conv2d_fwd_xdl_bias_relu.cpp              |  12 +-
 .../conv2d_fwd_xdl_bias_relu_add.cpp          |  12 +-
 example/09_convnd_fwd/convnd_fwd_xdl.cpp      |  12 +-
 example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp |  12 +-
 example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp |  12 +-
 .../conv2d_bwd_data_xdl.cpp                   |  14 +-
 .../conv2d_bwd_weight_xdl.cpp                 |  14 +-
 example/12_reduce/reduce_blockwise.cpp        |  13 +-
 example/13_pool2d_fwd/pool2d_fwd.cpp          |  14 +-
 .../gemm_xdl_requant_relu_requant_int8.cpp    |  14 +-
 .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp |  12 +-
 .../16_gemm_reduce/gemm_reduce_xdl_fp16.cpp   |  39 +---
 .../convnd_bwd_data_xdl.cpp                   |  14 +-
 .../batched_gemm_reduce_xdl_fp16.cpp          |  39 +---
 include/ck/hip_version.hpp.in                 |  28 ---
 include/ck/options.hpp.in                     |   3 +
 include/ck/stream_config.hpp                  |  10 +
 .../gpu/device/device_base.hpp                |  15 +-
 ...evice_batched_gemm_reduce_xdl_cshuffle.hpp |  94 ++++----
 .../gpu/device/device_batched_gemm_xdl.hpp    |  15 +-
 ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp |  70 ++----
 ...ice_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp |  11 +-
 ...fle_bias_activation_add_nhwc_kyxc_nhwk.hpp |  11 +-
 ...shuffle_bias_activation_nhwc_kyxc_nhwk.hpp |  11 +-
 ...onv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp |  11 +-
 .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp  |  15 +-
 ...ice_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp |  11 +-
 ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp |  15 +-
 ..._convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp |  11 +-
 .../device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp  |  15 +-
 .../device_gemm_reduce_xdl_cshuffle.hpp       |  86 +++----
 .../gpu/device/device_gemm_xdl.hpp            |  15 +-
 .../device_gemm_xdl_c_shuffle_bias_2d.hpp     |  11 +-
 ...ice_gemm_xdl_c_shuffle_bias_activation.hpp |  11 +-
 ...gemm_xdl_c_shuffle_bias_activation_add.hpp |  11 +-
 .../gpu/device/device_gemm_xdl_cshuffle.hpp   | 112 +++------
 .../gpu/device/device_gemm_xdl_splitk.hpp     |  74 +++---
 .../device_gemm_xdl_splitk_c_shuffle.hpp      |  74 +++---
 .../gpu/device/device_grouped_gemm_xdl.hpp    |  15 +-
 .../device/device_pool2d_fwd_nhwc_nhwc.hpp    |  11 +-
 .../gpu/device/device_reduce_blockwise.hpp    |  11 +-
 .../device_reduce_blockwise_second_call.hpp   |  13 +-
 .../device_reduce_multiblock_atomic_add.hpp   |  72 +++---
 ...evice_reduce_multiblock_partial_reduce.hpp |  13 +-
 .../gpu/device/device_reduce_threadwise.hpp   |  13 +-
 .../ck/library/host/host_interface.hpp        |  54 +++++
 .../include/ck/library/host_tensor/device.hpp | 109 +++++----
 .../cpu/reference_batched_gemm.hpp            |   3 +-
 .../cpu/reference_conv_backward_weight.hpp    |   3 +-
 .../cpu/reference_conv_bwd_data.hpp           |   3 +-
 .../cpu/reference_conv_fwd.hpp                |   9 +-
 .../reference_conv_fwd_bias_activation.hpp    |   3 +-
 ...reference_conv_fwd_bias_activation_add.hpp |   3 +-
 .../cpu/reference_gemm.hpp                    |   3 +-
 .../cpu/reference_gemm_bias_2d.hpp            |   3 +-
 .../cpu/reference_gemm_bias_activation.hpp    |   3 +-
 .../reference_gemm_bias_activation_add.hpp    |   3 +-
 .../ck/library/utility/op_instance_engine.hpp |   4 +-
 library/src/host_tensor/CMakeLists.txt        |  25 +-
 library/src/host_tensor/device.cpp            |  29 ++-
 .../gpu/CMakeLists.txt                        |  73 +++++-
 .../gpu/batched_gemm/CMakeLists.txt           |   6 +-
 .../gpu/batched_gemm_reduce/CMakeLists.txt    |   5 +-
 .../gpu/conv1d_fwd/CMakeLists.txt             |   6 +-
 .../gpu/conv2d_bwd_data/CMakeLists.txt        |   4 +-
 .../gpu/conv2d_bwd_weight/CMakeLists.txt      |   2 +-
 .../gpu/conv2d_fwd/CMakeLists.txt             |   4 +-
 .../gpu/conv2d_fwd_bias_relu/CMakeLists.txt   |   4 +-
 .../conv2d_fwd_bias_relu_add/CMakeLists.txt   |   4 +-
 .../CMakeLists.txt                            |   4 +-
 .../gpu/conv3d_fwd/CMakeLists.txt             |   3 +-
 .../gpu/convnd_bwd_data/CMakeLists.txt        |   2 +-
 .../gpu/device_conv2d.cpp                     | 201 ++++++++++++++++
 .../gpu/gemm/CMakeLists.txt                   |   3 +-
 .../gpu/gemm_bias2d/CMakeLists.txt            |   4 +-
 .../gpu/gemm_bias_relu/CMakeLists.txt         |   4 +-
 .../gpu/gemm_bias_relu_add/CMakeLists.txt     |   4 +-
 .../gpu/grouped_gemm/CMakeLists.txt           |   2 +-
 .../gpu/reduce/CMakeLists.txt                 |   4 +-
 .../include/profile_batched_gemm_impl.hpp     |   7 +-
 .../profile_batched_gemm_reduce_impl.hpp      |  30 +--
 .../include/profile_conv_bwd_data_impl.hpp    |   5 +-
 .../include/profile_conv_bwd_weight_impl.hpp  |  10 +-
 .../profile_conv_fwd_bias_relu_add_impl.hpp   |   5 +-
 ...ile_conv_fwd_bias_relu_atomic_add_impl.hpp |   5 +-
 .../profile_conv_fwd_bias_relu_impl.hpp       |   5 +-
 .../include/profile_convnd_bwd_data_impl.hpp  |   5 +-
 .../include/profile_gemm_bias_2d_impl.hpp     |   5 +-
 .../profile_gemm_bias_relu_add_impl.hpp       |   5 +-
 .../include/profile_gemm_bias_relu_impl.hpp   |   5 +-
 profiler/include/profile_gemm_impl.hpp        |   5 +-
 profiler/include/profile_gemm_reduce_impl.hpp |  32 +--
 .../include/profile_grouped_gemm_impl.hpp     |   5 +-
 profiler/include/profile_reduce_impl.hpp      |  15 +-
 profiler/src/profile_batched_gemm.cpp         |  38 ++--
 profiler/src/profile_batched_gemm_reduce.cpp  |  14 +-
 profiler/src/profile_conv_bwd_data.cpp        |  12 +-
 profiler/src/profile_conv_bwd_weight.cpp      |   6 +-
 profiler/src/profile_conv_fwd_bias_relu.cpp   |   6 +-
 .../src/profile_conv_fwd_bias_relu_add.cpp    |   6 +-
 .../profile_conv_fwd_bias_relu_atomic_add.cpp |   6 +-
 profiler/src/profile_convnd_bwd_data.cpp      |  10 +-
 profiler/src/profile_convnd_fwd.cpp           |  32 +--
 profiler/src/profile_gemm.cpp                 |  38 ++--
 profiler/src/profile_gemm_bias_2d.cpp         |  22 +-
 profiler/src/profile_gemm_bias_relu.cpp       |  14 +-
 profiler/src/profile_gemm_bias_relu_add.cpp   |  14 +-
 profiler/src/profile_gemm_reduce.cpp          |  14 +-
 profiler/src/profile_grouped_gemm.cpp         |  14 +-
 profiler/src/profile_reduce.cpp               |  20 +-
 test/CMakeLists.txt                           |   5 +-
 .../batched_gemm_reduce_fp16.cpp              |   8 +-
 test/client_app/CMakeLists.txt                |  11 +
 test/client_app/client_app.cpp                |  77 +++++++
 test/client_app/client_app_impl.hpp           | 214 ++++++++++++++++++
 test/conv2d_bwd_weight/conv2d_bwd_weight.cpp  |  32 +--
 test/convnd_bwd_data/convnd_bwd_data.cpp      |  96 ++++----
 test/gemm_reduce/gemm_reduce_fp16.cpp         |   8 +-
 test/gemm_split_k/gemm_split_k.cpp            |   3 +-
 131 files changed, 1666 insertions(+), 1089 deletions(-)
 create mode 100644 Config.cmake.in
 delete mode 100644 include/ck/hip_version.hpp.in
 create mode 100644 include/ck/options.hpp.in
 create mode 100644 include/ck/stream_config.hpp
 create mode 100644 library/include/ck/library/host/host_interface.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/device_conv2d.cpp
 create mode 100644 test/client_app/CMakeLists.txt
 create mode 100644 test/client_app/client_app.cpp
 create mode 100644 test/client_app/client_app_impl.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b798e38f3..f18c85c683 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
 
+option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
+
 ## OpenMP
 if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	# workaround issue hipcc in rocm3.5 cannot find openmp
@@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}")
 
 
 rocm_create_package(
-    NAME CK-${CK_BACKEND}
+    NAME composablekernel
     DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+    MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
     LDCONFIG
 )
 
@@ -226,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
 
-configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
+configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp")
 
 include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/include
@@ -234,7 +237,6 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/library/include
 )
 
-include(googletest)
 
 SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
 if(BUILD_DEV)
@@ -247,3 +249,25 @@ add_subdirectory(library)
 add_subdirectory(example)
 add_subdirectory(test)
 add_subdirectory(profiler)
+
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+
+set(version 1.0.0)
+write_basic_package_version_file(
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    VERSION "${version}"
+    COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in 
+        "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+        INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel 
+        NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+install(FILES 
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel 
+)
diff --git a/Config.cmake.in b/Config.cmake.in
new file mode 100644
index 0000000000..12b5c331ae
--- /dev/null
+++ b/Config.cmake.in
@@ -0,0 +1,11 @@
+@PACKAGE_INIT@
+
+set(_composable_kernel_supported_components device_operations host_tensor)
+
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+	if(NOT _comp IN_LIST _composable_kernel_supported_components)
+		set(composable_kernel_FOUND False)
+		set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+	endif()
+	include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+endforeach()
diff --git a/Dockerfile b/Dockerfile
index c4cf0fac57..9a443e01de 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 RUN apt-get update
 RUN apt-get install -y wget gnupg
 RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN if ! [ -z $OSDB_BKC_VERSION ]; then \
-       echo "Using BKC VERISION: $OSDB_BKC_VERSION";\
-       sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\
-       cat  /etc/apt/sources.list.d/rocm.list;\
-    else \
-       sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\
-    fi
+RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
 
@@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
     apt-utils \
-    sshpass \
     build-essential \
     cmake-data=3.15.1-0kitware1 \
     cmake=3.15.1-0kitware1 \
     curl \
-    doxygen \
     g++ \
     gdb \
     git \
     hip-rocclr \
     jq \
-    lcov \
     libelf-dev \
     libncurses5-dev \
     libnuma-dev \
@@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# RUN pip3 install --default-timeout=100000 -r requirements.txt
-
 # Setup ubsan environment to printstacktrace
 RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
 ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini
 ADD dev-requirements.txt dev-requirements.txt
 RUN rbuild prepare -s develop -d $PREFIX
 RUN groupadd -f render
-# RUN cget install -f min-requirements.txt
-# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt
diff --git a/Jenkinsfile b/Jenkinsfile
index f065d4ecc5..77f4d9d8be 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -320,7 +320,7 @@ pipeline {
                 {
                     agent{ label rocmnode("gfx908")}
                     environment{
-                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
+                        setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906  --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
@@ -341,6 +341,23 @@ pipeline {
 
             }
         }
+        stage("Client App")
+        {
+            parallel
+            {
+                stage("Run Client App")
+                {
+                    agent{ label rocmnode("gfx908")}
+                    environment{
+                        setup_args = """ -D  -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """
+                        execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make  """ 
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                    }
+                }
+            }
+        }
         stage("Performance Tests")
         {
             parallel
diff --git a/README.md b/README.md
index f5341b5736..9d7b578046 100644
--- a/README.md
+++ b/README.md
@@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/```
  make -j ckProfiler
 ```
 Instructions for running ckProfiler are under ```profiler/```
+
+
+## Caveat
+### Kernel Timing and Verification
+CK's own kernel timer will warn up kernel once, and then run it multiple times
+to get average kernel time. For some kernels that use atomic add, this will cause
+output buffer to be accumulated multiple times, causing verfication failure.
+To work around it, do not use CK's own timer and do verification at the same time.
+CK's own timer and verification in each example and ckProfiler can be enabled or
+disabled from command line.
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index f869ba483e..959bc4f4b0 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -19,6 +19,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
      -Wno-zero-as-null-pointer-constant
      -Wno-unused-member-function
      -Wno-comma
+     -Wno-old-style-cast
 )
 message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
 
@@ -35,4 +36,4 @@ FetchContent_MakeAvailable(googletest)
 target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
 target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
 target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
-
+target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index a4567dcd6e..4077a4f8d8 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -198,7 +198,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index fc04a13ca5..4f0228eafe 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -73,13 +73,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index ab5869db61..d5bf4a8bde 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -194,7 +194,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
index 2abebbbac4..451200e798 100644
--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
@@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D<AD
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -106,13 +106,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 6)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         alpha = std::stof(argv[4]);
         beta  = std::stof(argv[5]);
@@ -121,7 +121,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -138,7 +138,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
         exit(0);
     }
@@ -216,7 +216,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
index f3ed2bad37..308d423ce7 100644
--- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
+++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActiv
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -206,7 +206,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
index 9405c36881..012fd21341 100644
--- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
+++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance =
                                                                CElementOp>;
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -101,13 +101,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 11)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -122,7 +122,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n");
         exit(0);
     }
@@ -218,7 +218,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = std::size_t(2) * M * N * K;
     std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index 53095bde0d..342de268e3 100644
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -93,7 +93,7 @@ void PrintUseMsg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "Following arguments:\n"
               << " N, K, C, \n"
               << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification      = 0;
-    int init_method           = 0;
-    int nrepeat               = 5;
+    bool do_verification      = true;
+    int init_method           = 1;
+    bool time_kernel          = false;
     const int num_dim_spatial = 2;
 
     ck::utils::conv::ConvParams params;
@@ -176,7 +176,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
 
     if(argc >= 5)
@@ -269,7 +269,7 @@ int main(int argc, char* argv[])
             "not support this problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
index c2b4ca0b5d..ff4fc66cb8 100644
--- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -90,7 +90,7 @@ void PrintUseMsg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "Following arguments:\n"
               << " N, K, C, \n"
               << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
@@ -162,9 +162,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification      = 0;
-    int init_method           = 0;
-    int nrepeat               = 5;
+    bool do_verification      = true;
+    int init_method           = 1;
+    bool time_kernel          = false;
     const int num_dim_spatial = 2;
 
     ck::utils::conv::ConvParams params;
@@ -173,7 +173,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
 
     if(argc >= 5)
@@ -280,7 +280,7 @@ int main(int argc, char* argv[])
             "not support this problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl.cpp
index 71f49b5e71..112d606f56 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl.cpp
@@ -107,7 +107,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -179,9 +179,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
 
@@ -276,7 +276,7 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
index c1361a8db3..8b658e7790 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -110,7 +110,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -182,9 +182,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
@@ -193,7 +193,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
 
@@ -277,7 +277,7 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
index 3d3e34dfd9..e7988d8683 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -112,7 +112,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
 
@@ -279,7 +279,7 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
index f3f9b497f5..73210fa543 100644
--- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // Conv shape
     ck::index_t N               = 128;
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 19)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         N               = std::stoi(argv[4]);
         K               = std::stoi(argv[5]);
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(0);
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
index bf78cc87e0..0c996dc21b 100644
--- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int do_log           = 0;
     int split_k          = 4;
 
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
     }
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
 
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4: is show log (0=no, 1=yes)\n");
         printf("arg5: split-k \n");
         printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
         return 1;
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 7ca9823ff5..caa93c9df2 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -116,10 +116,9 @@ class SimpleAppArgs
     std::vector<size_t> inLengths;
     std::vector<float> scales;
 
-    bool do_verification = false;
-
-    int init_method = 1;
-    int nrepeat     = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     public:
     void show_usage(const char* cmd)
@@ -135,7 +134,7 @@ class SimpleAppArgs
         std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
                      "value, 3=decimal value)"
                   << std::endl;
-        std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
+        std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl;
     };
 
     int processArgs(int argc, char* argv[])
@@ -182,7 +181,7 @@ class SimpleAppArgs
             throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
 
         init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = std::atoi(argv[optind]);
 
         if(scales.empty())
         {
@@ -352,7 +351,7 @@ int main(int argc, char* argv[])
 
     auto invoker_ptr = reduce.MakeInvokerPointer();
 
-    float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
+    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
 
     std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
                             invariant_total_length * sizeof(OutDataType);
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp
index a18761095c..f4eb9d79f6 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -149,9 +149,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::host_reduce;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // Pool shape
     ck::index_t N               = 128;
@@ -171,13 +171,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 16)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         N               = std::stoi(argv[4]);
         C               = std::stoi(argv[5]);
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(0);
@@ -271,7 +271,7 @@ int main(int argc, char* argv[])
                                  "not support this problem");
     }
 
-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
 
diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
index 324dc35d3f..9fc63308b7 100644
--- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
+++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
@@ -105,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -125,13 +125,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -145,7 +145,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 29ef01f2ef..f55db1d45c 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     if(argc == 4)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         exit(0);
     }
 
@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
index 90064ae584..8fea54f635 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
@@ -58,9 +58,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 1;
+    bool do_verification = true;
     int init_method      = 1;
-    int nrepeat          = 5;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -79,13 +79,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -99,7 +99,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -192,30 +192,13 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    // warm up
-    invoker.Run(argument);
+    // init DO, D1 to 0
+    d0_device_buf.SetZero();
+    d1_device_buf.SetZero();
 
-    // timing
-    float total_time = 0;
-
-    for(int i = 0; i < nrepeat; ++i)
-    {
-        // init DO, D1 to 0
-        d0_device_buf.SetZero();
-        d1_device_buf.SetZero();
-
-        KernelTimer timer;
-
-        timer.Start();
-
-        invoker.Run(argument);
-
-        timer.End();
-
-        total_time += timer.GetElapsedTime();
-    }
-
-    float ave_time = total_time / nrepeat;
+    // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
+    // will not be correct. need to set time_kernel = false for correctness test
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
index 1b375ea339..a013f39827 100644
--- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
+++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
@@ -87,7 +87,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -165,9 +165,9 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
@@ -177,13 +177,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc > 4)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
         // check args number
         int conv_args     = 3 + num_dim_spatial * 6;
@@ -284,7 +284,7 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = ck::utils::conv::get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index eb18655d1b..f620ee1b20 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -57,9 +57,9 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 1;
+    bool do_verification = true;
     int init_method      = 1;
-    int nrepeat          = 5;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -80,13 +80,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 11)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -102,7 +102,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n");
         exit(0);
     }
@@ -204,30 +204,13 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    // warm up
-    invoker.Run(argument);
+    // init DO, D1 to 0
+    d0_device_buf.SetZero();
+    d1_device_buf.SetZero();
 
-    // timing
-    float total_time = 0;
-
-    for(int i = 0; i < nrepeat; ++i)
-    {
-        // init DO, D1 to 0
-        d0_device_buf.SetZero();
-        d1_device_buf.SetZero();
-
-        KernelTimer timer;
-
-        timer.Start();
-
-        invoker.Run(argument);
-
-        timer.End();
-
-        total_time += timer.GetElapsedTime();
-    }
-
-    float ave_time = total_time / nrepeat;
+    // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
+    // will not be correct. need to set time_kernel = false for correctness test
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = std::size_t(2) * BatchCount * M * N * K;
     std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K +
diff --git a/include/ck/hip_version.hpp.in b/include/ck/hip_version.hpp.in
deleted file mode 100644
index 4290ef7e0d..0000000000
--- a/include/ck/hip_version.hpp.in
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-// "_PACKAGE_" to avoid name contentions: the macros like
-// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
-// clang-format off
-#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
-#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
-#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
-// clang-format on
-
-#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
-#define CK_HIP_PACKAGE_VERSION_MAJOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_MINOR
-#define CK_HIP_PACKAGE_VERSION_MINOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_PATCH
-#define CK_HIP_PACKAGE_VERSION_PATCH 0
-#endif
-// 3 decimal digits for major and minor, 6 digits for patch number.
-// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
-#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \
-    CK_HIP_PACKAGE_VERSION_PATCH > 999999
-#error "Too big HIP version number(s)"
-#endif
-#define CK_HIP_PACKAGE_VERSION_FLAT                                                      \
-    ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
-     CK_HIP_PACKAGE_VERSION_PATCH)
diff --git a/include/ck/options.hpp.in b/include/ck/options.hpp.in
new file mode 100644
index 0000000000..87ed6026a4
--- /dev/null
+++ b/include/ck/options.hpp.in
@@ -0,0 +1,3 @@
+#pragma once
+
+#cmakedefine01 CK_TIME_KERNEL
diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp
new file mode 100644
index 0000000000..3e80b4c892
--- /dev/null
+++ b/include/ck/stream_config.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+struct StreamConfig
+{
+    hipStream_t stream_id_ = nullptr;
+    bool time_kernel_      = false;
+};
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index cf48695ad0..950cfc1d61 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -1,8 +1,9 @@
-#ifndef DEVICE_BASE_HPP
-#define DEVICE_BASE_HPP
+#pragma once
 
 #include <string>
 
+#include "stream_config.hpp"
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -22,7 +23,10 @@ struct BaseInvoker
     BaseInvoker(const BaseInvoker&) = default;
     BaseInvoker& operator=(const BaseInvoker&) = default;
 
-    virtual float Run(const BaseArgument*, int = 1) = 0;
+    virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
+    {
+        return float{0};
+    }
 
     virtual ~BaseInvoker() {}
 };
@@ -33,8 +37,8 @@ struct BaseOperator
     BaseOperator(const BaseOperator&) = default;
     BaseOperator& operator=(const BaseOperator&) = default;
 
-    virtual bool IsSupportedArgument(const BaseArgument*) = 0;
-    virtual std::string GetTypeString() const             = 0;
+    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
+    virtual std::string GetTypeString() const { return ""; }
 
     virtual ~BaseOperator() {}
 };
@@ -42,4 +46,3 @@ struct BaseOperator
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
index e1d354b344..a6408007ed 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -693,7 +693,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int /* nrepeat */ = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -729,6 +729,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
             const auto K =
                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
 
+            float elapsed_time = 0.0f;
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
                 const auto kernel = kernel_batched_gemm_reduce_xdl_cshuffle_v1<
@@ -748,26 +749,28 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
                     remove_reference_t<Block2CTileMap>,
                     true>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.BatchCount_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.compute_base_ptr_of_batch_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.BatchCount_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.compute_base_ptr_of_batch_,
+                                           arg.block_2_ctile_map_);
             }
             else
             {
@@ -788,35 +791,38 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
                     remove_reference_t<Block2CTileMap>,
                     false>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.BatchCount_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.compute_base_ptr_of_batch_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.BatchCount_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.compute_base_ptr_of_batch_,
+                                           arg.block_2_ctile_map_);
             }
 
-            return 0;
+            return elapsed_time;
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
index 88974a5221..ea7704951e 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -428,7 +428,7 @@ struct DeviceBatchedGemmXdl
     {
         using Argument = DeviceBatchedGemmXdl::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -477,8 +477,8 @@ struct DeviceBatchedGemmXdl
                     remove_reference_t<Block2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -511,8 +511,8 @@ struct DeviceBatchedGemmXdl
                     remove_reference_t<Block2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -534,9 +534,10 @@ struct DeviceBatchedGemmXdl
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 466e6ad89f..c36227083c 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -415,9 +415,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             ShowInfo(arg);
+
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                             arg.b_grid_desc_kbatch_k0_n_k1_,
                                             arg.c_grid_desc_m_n_,
@@ -437,49 +438,27 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                if(nrepeat > 0)
-                {
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_grid_desc_kbatch_k0_m_k1_,
-                                               arg.b_grid_desc_kbatch_k0_n_k1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.block_2_ctile_map_);
-                }
+                hipGetErrorString(hipMemset(
+                    arg.p_c_grid_,
+                    0,
+                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+                        sizeof(CDataType)));
 
-                if(kbatch > 1 || nrepeat <= 0)
-                {
-                    hipGetErrorString(hipMemset(
-                        arg.p_c_grid_,
-                        0,
-                        arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                            sizeof(CDataType)));
-
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_grid_desc_kbatch_k0_m_k1_,
-                                  arg.b_grid_desc_kbatch_k0_n_k1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.block_2_ctile_map_);
-                }
+                launch_and_time_kernel(stream_config,
+                                       kernel,
+                                       dim3(grid_size),
+                                       dim3(BlockSize),
+                                       0,
+                                       arg.p_a_grid_,
+                                       arg.p_b_grid_,
+                                       arg.p_c_grid_,
+                                       arg.a_grid_desc_kbatch_k0_m_k1_,
+                                       arg.b_grid_desc_kbatch_k0_n_k1_,
+                                       arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                       arg.a_element_op_,
+                                       arg.b_element_op_,
+                                       arg.c_element_op_,
+                                       arg.block_2_ctile_map_);
             };
 
             if(has_main_k0_block_loop)
@@ -560,9 +539,10 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index fad4ec1ffa..def6af74ac 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -531,7 +531,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
             for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -602,8 +602,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         true>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config,
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -635,8 +635,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         false>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config,
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -655,9 +655,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index 6648929cd5..fd95c184ca 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -642,7 +642,7 @@ struct
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -727,8 +727,8 @@ struct
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -771,8 +771,8 @@ struct
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -795,9 +795,10 @@ struct
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index fd0941420c..61c91c0b76 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -605,7 +605,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -684,8 +684,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -723,8 +723,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -745,9 +745,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index b508606a75..f4cddc1946 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -663,8 +663,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -697,8 +697,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -717,9 +717,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 3574f7667e..aa9229f7cb 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -498,8 +498,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -529,8 +529,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -549,9 +549,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index 1bfe0bb256..b1eea0b33f 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -92,7 +92,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto naive_conv3d_fwd =
                 ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
@@ -103,8 +103,8 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
                                                       WeiElementwiseOperation,
                                                       OutElementwiseOperation>;
 
-            float ave_time = launch_and_time_kernel(naive_conv3d_fwd,
-                                                    nrepeat,
+            float ave_time = launch_and_time_kernel(stream_config,
+                                                    naive_conv3d_fwd,
                                                     dim3(256),
                                                     dim3(256),
                                                     0,
@@ -137,9 +137,10 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index ff30a6880d..0f98ba054d 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -438,7 +438,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl;
@@ -487,8 +487,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
                     OutElementwiseOperation,
                     remove_reference_t<Block2CTileMap>,
                     true>;
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -522,8 +522,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
                     remove_reference_t<Block2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -547,9 +547,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
index 5dca8f9629..209b3c866e 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1241,7 +1241,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
             for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -1316,8 +1316,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
                         true>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config,
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -1349,8 +1349,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
                         false>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config,
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -1369,9 +1369,10 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 7365f9a3e2..4251052a99 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -747,7 +747,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -795,8 +795,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -826,8 +826,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -846,9 +846,10 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
index daa309888f..69c29b72d3 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
@@ -503,7 +503,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int /* nrepeat */ = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -536,6 +536,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
             const auto K =
                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
 
+            float elapsed_time = 0.0f;
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
                 const auto kernel = kernel_gemm_reduce_xdl_cshuffle_v1<
@@ -554,24 +555,26 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.block_2_ctile_map_);
             }
             else
             {
@@ -591,33 +594,36 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.block_2_ctile_map_);
             }
 
-            return 0;
+            return elapsed_time;
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
index 47997cd802..2bb7f6e78a 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
@@ -290,7 +290,7 @@ struct DeviceGemmXdl
     {
         using Argument = DeviceGemmXdl::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -339,8 +339,8 @@ struct DeviceGemmXdl
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -370,8 +370,8 @@ struct DeviceGemmXdl
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -391,9 +391,10 @@ struct DeviceGemmXdl
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
index 4010965312..315f39d9bf 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
@@ -264,7 +264,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
     {
         using Argument = DeviceGemmXdl_C_Shuffle_Bias_2d::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -320,8 +320,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -359,8 +359,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -382,9 +382,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
index c65ff6022a..f1f9f41724 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
@@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -329,8 +329,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -368,8 +368,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -391,9 +391,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
index 4a478c995d..e3d0986aba 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
@@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -374,8 +374,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -418,8 +418,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -443,9 +443,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
index fde27acdb1..952630120a 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
@@ -440,7 +440,7 @@ struct DeviceGemm_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -487,42 +487,22 @@ struct DeviceGemm_Xdl_CShuffle
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;
 
-                if(nrepeat == 0)
-                {
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
-                }
-                else
-                {
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-                }
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
             }
             else
             {
@@ -538,52 +518,32 @@ struct DeviceGemm_Xdl_CShuffle
                     typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;
-
-                if(nrepeat == 0)
-                {
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
-                }
-                else
-                {
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-                }
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
             }
 
             return ave_time;
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
index db6c884739..e603af1fba 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
@@ -385,8 +385,11 @@ struct DeviceGemmXdlSplitK
             std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
-        float Run(const Argument& arg, int nrepeat = 1)
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            ShowInfo(arg);
+
             const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
 
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
@@ -408,50 +411,30 @@ struct DeviceGemmXdlSplitK
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                if(nrepeat > 0)
-                {
-                    ShowInfo(arg);
-                    ave_time = launch_and_time_kernel(kernel,
-                                                      nrepeat,
-                                                      dim3(grid_size),
-                                                      dim3(BlockSize),
-                                                      0,
-                                                      arg.p_a_grid_,
-                                                      arg.p_b_grid_,
-                                                      arg.p_c_grid_,
-                                                      arg.a_grid_desc_kbatch_k0_m_k1_,
-                                                      arg.b_grid_desc_kbatch_k0_n_k1_,
-                                                      arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                      arg.a_element_op_,
-                                                      arg.b_element_op_,
-                                                      arg.c_element_op_,
-                                                      arg.block_2_ctile_map_);
-                }
+                // FIXME: this should be moved outside of DeviceOp
+                hipGetErrorString(
+                    hipMemset(arg.p_c_grid_,
+                              0,
+                              arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
+                                  sizeof(CDataType)));
 
-                if(kbatch > 1 || nrepeat <= 0)
-                {
-                    hipGetErrorString(
-                        hipMemset(arg.p_c_grid_,
-                                  0,
-                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
-                                      sizeof(CDataType)));
-
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_grid_desc_kbatch_k0_m_k1_,
-                                  arg.b_grid_desc_kbatch_k0_n_k1_,
-                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.block_2_ctile_map_);
-                }
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  arg.p_a_grid_,
+                                                  arg.p_b_grid_,
+                                                  arg.p_c_grid_,
+                                                  arg.a_grid_desc_kbatch_k0_m_k1_,
+                                                  arg.b_grid_desc_kbatch_k0_n_k1_,
+                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
+                                                  arg.a_element_op_,
+                                                  arg.b_element_op_,
+                                                  arg.c_element_op_,
+                                                  arg.block_2_ctile_map_);
             };
+
             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
@@ -531,9 +514,10 @@ struct DeviceGemmXdlSplitK
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
index 9de5361ab6..7d00224429 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -391,8 +391,11 @@ struct DeviceGemmXdlSplitKCShuffle
             std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
-        float Run(const Argument& arg, int nrepeat = 1)
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            ShowInfo(arg);
+
             const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
 
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
@@ -414,51 +417,29 @@ struct DeviceGemmXdlSplitKCShuffle
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                if(nrepeat > 0)
-                {
-                    ShowInfo(arg);
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_grid_desc_kbatch_k0_m_k1_,
-                                               arg.b_grid_desc_kbatch_k0_n_k1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.block_2_ctile_map_);
-                }
+                hipGetErrorString(hipMemset(
+                    arg.p_c_grid_,
+                    0,
+                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+                        sizeof(CDataType)));
 
-                if(kbatch > 1 || nrepeat <= 0)
-                {
-                    hipGetErrorString(hipMemset(
-                        arg.p_c_grid_,
-                        0,
-                        arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                            sizeof(CDataType)));
-
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_grid_desc_kbatch_k0_m_k1_,
-                                  arg.b_grid_desc_kbatch_k0_n_k1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.block_2_ctile_map_);
-                }
+                launch_and_time_kernel(stream_config,
+                                       kernel,
+                                       dim3(grid_size),
+                                       dim3(BlockSize),
+                                       0,
+                                       arg.p_a_grid_,
+                                       arg.p_b_grid_,
+                                       arg.p_c_grid_,
+                                       arg.a_grid_desc_kbatch_k0_m_k1_,
+                                       arg.b_grid_desc_kbatch_k0_n_k1_,
+                                       arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                       arg.a_element_op_,
+                                       arg.b_element_op_,
+                                       arg.c_element_op_,
+                                       arg.block_2_ctile_map_);
             };
+
             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
@@ -542,9 +523,10 @@ struct DeviceGemmXdlSplitKCShuffle
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
index dfc1ce2715..730b2d787e 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
@@ -449,7 +449,7 @@ struct DeviceGroupedGemmXdl
     {
         using Argument = DeviceGroupedGemmXdl::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             StaticallyIndexedArray<GemmDescKernelArg, MaxGroupCount> gemm_desc_kernel_args;
 
@@ -510,8 +510,8 @@ struct DeviceGroupedGemmXdl
                                                     true,
                                                     MaxGroupCount>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(arg.grid_size_),
                                                   dim3(BlockSize),
                                                   0,
@@ -534,8 +534,8 @@ struct DeviceGroupedGemmXdl
                                                     false,
                                                     MaxGroupCount>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(arg.grid_size_),
                                                   dim3(BlockSize),
                                                   0,
@@ -550,9 +550,10 @@ struct DeviceGroupedGemmXdl
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
index 651d31ae2f..f665378e08 100644
--- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
                                                                          OutDataType,
@@ -241,8 +241,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
 
             const index_t grid_size = (ReduceM / ReduceM_BlockTileSize);
 
-            return launch_and_time_kernel(kernel,
-                                          nrepeat,
+            return launch_and_time_kernel(stream_config,
+                                          kernel,
                                           dim3(grid_size),
                                           dim3(BlockSize),
                                           0,
@@ -257,9 +257,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
                                           arg.p_out_indices_dev_);
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
index 4f17989b53..860f53d8c5 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
@@ -211,7 +211,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k =
                 DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
@@ -253,8 +253,8 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
                                                         InElementwiseOperation,
                                                         AccElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -272,9 +272,10 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         };
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
index d3b1b4b5c3..43ac48cecc 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
@@ -182,7 +182,7 @@ struct DeviceReduceBlockWiseSecondCall
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_);
@@ -224,8 +224,8 @@ struct DeviceReduceBlockWiseSecondCall
                                                                     InElementwiseOperation,
                                                                     AccElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -243,10 +243,11 @@ struct DeviceReduceBlockWiseSecondCall
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
index 889c366875..f93c65fe18 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
@@ -245,7 +245,7 @@ struct DeviceReduceMultiBlockAtomicAdd
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
@@ -275,8 +275,6 @@ struct DeviceReduceMultiBlockAtomicAdd
 
             float avg_time = 0;
 
-            KernelTimer timer;
-
             const auto kernel_pre  = kernel_buffer_set_value<BlockSize, OutDataType, OutGridDesc_M>;
             const auto kernel_main = kernel_reduce_multiblock_atocmi_add<GridwiseReduce,
                                                                          InDataType,
@@ -287,50 +285,38 @@ struct DeviceReduceMultiBlockAtomicAdd
                                                                          InElementwiseOperation,
                                                                          AccElementwiseOperation>;
 
-            printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n",
-                   arg.gridSize,
-                   BlockSize);
-            printf("Warm up\n");
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel_pre,
+                                               dim3(arg.gridSize_pre),
+                                               dim3(BlockSize),
+                                               0,
+                                               out_grid_desc_m,
+                                               arg.out_dev_,
+                                               static_cast<OutDataType>(0.0f));
 
-            for(int i = 0; i < nrepeat + 1; i++)
-            {
-                if(i == 1)
-                    timer.Start();
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel_main,
+                                               dim3(arg.gridSize),
+                                               dim3(BlockSize),
+                                               0,
+                                               in_grid_desc_m_k,
+                                               out_grid_desc_m,
+                                               arg.in_elementwise_op_,
+                                               arg.acc_elementwise_op_,
+                                               arg.blkGroupSize,
+                                               arg.kBlockTileIterations,
+                                               arg.alpha_,
+                                               arg.in_dev_,
+                                               arg.out_dev_);
 
-                launch_kernel(kernel_pre,
-                              dim3(arg.gridSize_pre),
-                              dim3(BlockSize),
-                              0,
-                              out_grid_desc_m,
-                              arg.out_dev_,
-                              static_cast<OutDataType>(0.0f));
+            return avg_time;
+        }
 
-                launch_kernel(kernel_main,
-                              dim3(arg.gridSize),
-                              dim3(BlockSize),
-                              0,
-                              in_grid_desc_m_k,
-                              out_grid_desc_m,
-                              arg.in_elementwise_op_,
-                              arg.acc_elementwise_op_,
-                              arg.blkGroupSize,
-                              arg.kBlockTileIterations,
-                              arg.alpha_,
-                              arg.in_dev_,
-                              arg.out_dev_);
-            };
-
-            timer.End();
-
-            avg_time = timer.GetElapsedTime() / nrepeat;
-
-            return (avg_time);
-        };
-
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
index d583f7f1b8..b4eb8116c2 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
@@ -273,7 +273,7 @@ struct DeviceReduceMultiBlockPartialReduce
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
@@ -313,8 +313,8 @@ struct DeviceReduceMultiBlockPartialReduce
                                                                  InElementwiseOperation,
                                                                  AccElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -331,10 +331,11 @@ struct DeviceReduceMultiBlockPartialReduce
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
index bf4088a96b..dacb175043 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
@@ -212,7 +212,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k =
                 DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
@@ -254,8 +254,8 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                                          InElementwiseOperation,
                                                          OutElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -272,10 +272,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/library/include/ck/library/host/host_interface.hpp b/library/include/ck/library/host/host_interface.hpp
new file mode 100644
index 0000000000..955da0f4be
--- /dev/null
+++ b/library/include/ck/library/host/host_interface.hpp
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "stream_config.hpp"
+#include "config.hpp"
+#include "device_base.hpp"
+
+struct DeviceConvFwdPtr_t
+{
+    using BaseArgument = ck::tensor_operation::device::BaseArgument;
+    using BaseInvoker  = ck::tensor_operation::device::BaseInvoker;
+
+    struct DeviceConvFwdPtrImpl;
+    std::unique_ptr<DeviceConvFwdPtrImpl> pImpl;
+    DeviceConvFwdPtr_t();
+    ~DeviceConvFwdPtr_t();
+    DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&);
+    DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&);
+    DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete;
+    DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete;
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(void* in_ptr,
+                        void* wei_ptr,
+                        void* out_ptr,
+                        size_t N,
+                        size_t K,
+                        size_t C,
+                        std::vector<ck::index_t> input_spatial_lengths,
+                        std::vector<ck::index_t> filter_spatial_lengths,
+                        std::vector<ck::index_t> output_spatial_lengths,
+                        std::vector<ck::index_t> conv_filter_strides,
+                        std::vector<ck::index_t> conv_filter_dilations,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads)
+        const; // in,wei and out element ops are ignored for now since even if we change them, they
+               // cant be linked
+    std::unique_ptr<BaseInvoker>
+    MakeInvokerPointer() const; // requires including BaseInvoker headers
+    std::string GetTypeString();
+    bool IsSupportedArgument(const BaseArgument* arg_ptr);
+};
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
diff --git a/library/include/ck/library/host_tensor/device.hpp b/library/include/ck/library/host_tensor/device.hpp
index f33b8d4f40..d549b14c8c 100644
--- a/library/include/ck/library/host_tensor/device.hpp
+++ b/library/include/ck/library/host_tensor/device.hpp
@@ -1,12 +1,25 @@
-#ifndef DEVICE_HPP
-#define DEVICE_HPP
+#pragma once
 
 #include <memory>
 #include <functional>
 #include <thread>
 #include <chrono>
-#include "hip/hip_runtime.h"
-#include "hip/hip_fp16.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+#include "stream_config.hpp"
+#include "ck/options.hpp"
+
+inline void hip_check_error(hipError_t x)
+{
+    if(x != hipSuccess)
+    {
+        std::ostringstream ss;
+        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
+           << "in function: " << __func__;
+        throw std::runtime_error(ss.str());
+    }
+}
 
 struct DeviceMem
 {
@@ -36,49 +49,59 @@ struct KernelTimer
     std::unique_ptr<KernelTimerImpl> impl;
 };
 
-using device_stream_t = hipStream_t;
-
 template <typename... Args, typename F>
-void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
+float launch_and_time_kernel(const StreamConfig& stream_config,
+                             F kernel,
+                             dim3 grid_dim,
+                             dim3 block_dim,
+                             std::size_t lds_byte,
+                             Args... args)
 {
-    hipStream_t stream_id = nullptr;
-
-    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-}
-
-template <typename... Args, typename F>
-float launch_and_time_kernel(
-    F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
-{
-    KernelTimer timer;
-
-    printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-           __func__,
-           grid_dim.x,
-           grid_dim.y,
-           grid_dim.z,
-           block_dim.x,
-           block_dim.y,
-           block_dim.z);
-
-    printf("Warm up\n");
-
-    hipStream_t stream_id = nullptr;
-
-    // warm up
-    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-
-    printf("Start running %d times...\n", nrepeat);
-
-    timer.Start();
-
-    for(int i = 0; i < nrepeat; ++i)
+#if CK_TIME_KERNEL
+    if(stream_config.time_kernel_)
     {
-        hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
+        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
+               __func__,
+               grid_dim.x,
+               grid_dim.y,
+               grid_dim.z,
+               block_dim.x,
+               block_dim.y,
+               block_dim.z);
+
+        const int nrepeat = 10;
+
+        printf("Warm up 1 time\n");
+
+        // warm up
+        hipLaunchKernelGGL(
+            kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+
+        printf("Start running %d times...\n", nrepeat);
+
+        KernelTimer timer;
+        timer.Start();
+
+        for(int i = 0; i < nrepeat; ++i)
+        {
+            hipLaunchKernelGGL(
+                kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+        }
+
+        timer.End();
+
+        return timer.GetElapsedTime() / nrepeat;
     }
+    else
+    {
+        hipLaunchKernelGGL(
+            kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
 
-    timer.End();
+        return 0;
+    }
+#else
+    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
 
-    return timer.GetElapsedTime() / nrepeat;
-}
+    return 0;
 #endif
+}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
index 3a706dac0b..f4944a28d2 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
index c5f3cbad69..10619ae6d9 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
@@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
index 9e91f06e7f..45fc8b8503 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
@@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
             }
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
index 65e59db2f8..d1afa898e4 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -1,9 +1,10 @@
-#ifndef REFERENCE_CONV_FWD_HPP
-#define REFERENCE_CONV_FWD_HPP
+#pragma once
 
 #include <iostream>
 #include <type_traits>
 #include <sstream>
+
+#include "stream_config.hpp"
 #include "device_base.hpp"
 #include "host_tensor.hpp"
 
@@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator
             }
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
@@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
index ee95cd410a..4be6169c15 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
@@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
index 11232cc98f..466537c686 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
@@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index 1b49ca5740..d89c8f5e05 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
index 7dd6fc9199..3e7f220e03 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
index 7c9df272c2..60f72e9e51 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
index 4d3c5effae..5e0ec75e5e 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp
index ec88b4e1b9..5429f66d3e 100644
--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
 
     template <typename OpInstancePtr>
     ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
-                              int nrepeat          = 100,
+                              bool time_kernel     = false,
                               bool do_verification = false,
                               bool do_log          = false)
     {
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
             if(op_ptr->IsSupportedArgument(argument.get()))
             {
                 std::string op_name = op_ptr->GetTypeString();
-                float avg_time      = invoker->Run(argument.get(), nrepeat);
+                float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
                 std::size_t flops     = op_instance_.GetFlops();
                 std::size_t num_btype = op_instance_.GetBtype();
diff --git a/library/src/host_tensor/CMakeLists.txt b/library/src/host_tensor/CMakeLists.txt
index fd100e477f..2a020b763d 100644
--- a/library/src/host_tensor/CMakeLists.txt
+++ b/library/src/host_tensor/CMakeLists.txt
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
     host_tensor.cpp
 )
 
-add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
+add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE})
+add_library(composable_kernel::host_tensor ALIAS host_tensor)
+
 target_compile_features(host_tensor PUBLIC)
 set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-install(TARGETS host_tensor LIBRARY DESTINATION lib)
+
+target_include_directories(host_tensor PUBLIC 
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
+)
+
+install(TARGETS host_tensor 
+        EXPORT host_tensorTargets
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+        INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
+install(EXPORT host_tensorTargets
+    FILE composable_kernelhost_tensorTargets.cmake 
+    NAMESPACE composable_kernel::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
 
 clang_tidy_check(host_tensor)
diff --git a/library/src/host_tensor/device.cpp b/library/src/host_tensor/device.cpp
index 3e80df80fb..9f0d982dbc 100644
--- a/library/src/host_tensor/device.cpp
+++ b/library/src/host_tensor/device.cpp
@@ -2,7 +2,7 @@
 
 DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
 {
-    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+    hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
 }
 
 void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
 
 void DeviceMem::ToDevice(const void* p)
 {
-    hipGetErrorString(
-        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
+    hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
 }
 
 void DeviceMem::FromDevice(void* p)
 {
-    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
+    hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
 }
 
-void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); }
+void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
 
-DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
+DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
 
 struct KernelTimerImpl
 {
     KernelTimerImpl()
     {
-        hipGetErrorString(hipEventCreate(&mStart));
-        hipGetErrorString(hipEventCreate(&mEnd));
+        hip_check_error(hipEventCreate(&mStart));
+        hip_check_error(hipEventCreate(&mEnd));
     }
 
     ~KernelTimerImpl()
     {
-        hipGetErrorString(hipEventDestroy(mStart));
-        hipGetErrorString(hipEventDestroy(mEnd));
+        hip_check_error(hipEventDestroy(mStart));
+        hip_check_error(hipEventDestroy(mEnd));
     }
 
     void Start()
     {
-        hipGetErrorString(hipDeviceSynchronize());
-        hipGetErrorString(hipEventRecord(mStart, nullptr));
+        hip_check_error(hipDeviceSynchronize());
+        hip_check_error(hipEventRecord(mStart, nullptr));
     }
 
     void End()
     {
-        hipGetErrorString(hipEventRecord(mEnd, nullptr));
-        hipGetErrorString(hipEventSynchronize(mEnd));
+        hip_check_error(hipEventRecord(mEnd, nullptr));
+        hip_check_error(hipEventSynchronize(mEnd));
     }
 
     float GetElapsedTime() const
     {
         float time;
-        hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
+        hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
         return time;
     }
 
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 7b361b48bd..5abfb0c074 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -11,6 +11,7 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
     ${PROJECT_SOURCE_DIR}/external/include/half
@@ -18,7 +19,7 @@ include_directories(BEFORE
 
 function(add_instance_library INSTANCE_NAME)
     message("adding instance ${INSTANCE_NAME}")
-    add_library(${INSTANCE_NAME} SHARED ${ARGN}) 
+    add_library(${INSTANCE_NAME} OBJECT ${ARGN}) 
     target_compile_features(${INSTANCE_NAME} PUBLIC)
     set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endfunction(add_instance_library INSTANCE_NAME)
@@ -41,3 +42,73 @@ add_subdirectory(convnd_bwd_data)
 add_subdirectory(grouped_gemm)
 add_subdirectory(conv2d_bwd_weight)
 add_subdirectory(batched_gemm_reduce)
+
+add_library(device_operations STATIC 
+    $<TARGET_OBJECTS:device_conv1d_fwd_instance> 
+    $<TARGET_OBJECTS:device_batched_gemm_instance> 
+    $<TARGET_OBJECTS:device_conv2d_bwd_data_instance> 
+    $<TARGET_OBJECTS:device_conv2d_fwd_instance> 
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance> 
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_atomic_add_instance>
+    $<TARGET_OBJECTS:device_gemm_instance>
+    $<TARGET_OBJECTS:device_gemm_bias_relu_instance>
+    $<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
+    $<TARGET_OBJECTS:device_gemm_bias2d_instance>
+    $<TARGET_OBJECTS:device_reduce_instance>
+    $<TARGET_OBJECTS:device_convnd_bwd_data_instance>
+    $<TARGET_OBJECTS:device_grouped_gemm_instance>
+    $<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
+    $<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
+    $<TARGET_OBJECTS:device_conv3d_fwd_instance>
+    device_conv2d.cpp
+)
+add_library(composablekernels::device_operations ALIAS device_operations)
+
+
+set(DEV_OPS_INC_DIRS 
+    ${PROJECT_SOURCE_DIR}/include/ck/
+    ${PROJECT_SOURCE_DIR}/library/include/ck/
+    ${PROJECT_SOURCE_DIR}/external/include/
+)
+target_compile_features(device_operations PUBLIC)
+set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(device_operations PUBLIC 
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/half>
+)
+
+#once new arches are enabled make this an option on the main cmake file
+# and pass down here to be exported
+
+target_compile_options(device_operations
+PRIVATE --offload-arch=gfx908
+)
+# install(TARGETS device_operations LIBRARY DESTINATION lib)
+install(TARGETS device_operations
+        EXPORT device_operationsTargets
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+        INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
+install(EXPORT device_operationsTargets 
+        FILE composable_kerneldevice_operationsTargets.cmake 
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
index 35e24462b5..016c85f673 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
@@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE
    device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp;
 )
 
-add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
-target_compile_features(device_batched_gemm_instance PUBLIC)
+add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
+# target_compile_features(device_batched_gemm_instance PUBLIC)
 set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
+# install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_batched_gemm_instance)
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
index 59eb6cb1cc..67a3c15d00 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
@@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE
     device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
 )
 
-add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
-install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib)
+add_instance_library(device_batched_gemm_reduce_instance OBJECT ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
+target_compile_features(device_batched_gemm_reduce_instance PUBLIC)
+set_target_properties(device_batched_gemm_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 clang_tidy_check(device_batched_gemm_reduce_instance)
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
index 6c7c3e4f78..77aa6198f5 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
@@ -6,9 +6,9 @@ set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE
    device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp;
 )
 
-add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv1d_fwd_instance PUBLIC)
+add_library(device_conv1d_fwd_instance OBJECT ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) 
+# target_compile_features(device_conv1d_fwd_instance PUBLIC)
 set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) 
+# install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv1d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
index d619ef4bf1..d7882a7d8b 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE
    device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
 ) 
 
-add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
-target_compile_features(device_conv2d_bwd_data_instance PUBLIC)
+add_library(device_conv2d_bwd_data_instance OBJECT ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
 set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_bwd_data_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
index 6183e70b9b..7c384a882b 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
@@ -3,7 +3,7 @@ set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE
    device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
    device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
 )
-add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) 
+add_library(device_conv2d_bwd_weight_instance OBJECT ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) 
 target_compile_features(device_conv2d_bwd_weight_instance PUBLIC)
 set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib) 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
index 7483861524..857e36d6f5 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
    device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
-add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_instance PUBLIC)
+add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
index 27a9736a3f..ad66c73bf8 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
@@ -2,9 +2,7 @@
 set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
-add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_bias_relu_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
index d7bec82174..36b1f6c153 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
@@ -2,9 +2,7 @@
 set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
-add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_bias_relu_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
index c0942d5485..5906c7c5ac 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
@@ -3,9 +3,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
 
-add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_atomic_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_bias_relu_atomic_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
index f6849a7bb2..91a299c742 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
@@ -5,9 +5,8 @@ set(DEVICE_CONV3D_FWD_INSTANCE_SOURCE
    device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp;
    device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
 )
-add_library(device_conv3d_fwd_instance SHARED ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) 
+add_library(device_conv3d_fwd_instance OBJECT ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) 
 target_compile_features(device_conv3d_fwd_instance PUBLIC)
 set_target_properties(device_conv3d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv3d_fwd_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv3d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
index 9ee961ad74..037f860808 100644
--- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE
    device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
 ) 
 
-add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
+add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
 target_compile_features(device_convnd_bwd_data_instance PUBLIC)
 set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib) 
diff --git a/library/src/tensor_operation_instance/gpu/device_conv2d.cpp b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp
new file mode 100644
index 0000000000..6b99433ffa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp
@@ -0,0 +1,201 @@
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+#include "host_interface.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_fwd_instance {
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
+} // namespace device_conv2d_fwd_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl
+{
+    std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
+    MakeArgumentPointer(void* in_ptr,
+                        void* wei_ptr,
+                        void* out_ptr,
+                        size_t N,
+                        size_t K,
+                        size_t C,
+                        std::vector<ck::index_t> input_spatial_lengths,
+                        std::vector<ck::index_t> filter_spatial_lengths,
+                        std::vector<ck::index_t> output_spatial_lengths,
+                        std::vector<ck::index_t> conv_filter_strides,
+                        std::vector<ck::index_t> conv_filter_dilations,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads) const
+    {
+        return el->MakeArgumentPointer(in_ptr,
+                                       wei_ptr,
+                                       out_ptr,
+                                       N,
+                                       K,
+                                       C,
+                                       input_spatial_lengths,
+                                       filter_spatial_lengths,
+                                       output_spatial_lengths,
+                                       conv_filter_strides,
+                                       conv_filter_dilations,
+                                       input_left_pads,
+                                       input_right_pads,
+                                       PassThrough{},
+                                       PassThrough{},
+                                       PassThrough{});
+    }
+    std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> MakeInvokerPointer() const
+    {
+        return el->MakeInvokerPointer();
+    }
+
+    std::string GetTypeString() { return el->GetTypeString(); }
+    bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg)
+    {
+        return el->IsSupportedArgument(arg);
+    }
+
+    ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough> el;
+};
+
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {}
+DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t()                    = default;
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default;
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other)
+    : pImpl(std::make_unique<DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl>(std::move(other)))
+{
+}
+
+std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
+DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr,
+                                        void* wei_ptr,
+                                        void* out_ptr,
+                                        size_t N,
+                                        size_t K,
+                                        size_t C,
+                                        std::vector<ck::index_t> input_spatial_lengths,
+                                        std::vector<ck::index_t> filter_spatial_lengths,
+                                        std::vector<ck::index_t> output_spatial_lengths,
+                                        std::vector<ck::index_t> conv_filter_strides,
+                                        std::vector<ck::index_t> conv_filter_dilations,
+                                        std::vector<ck::index_t> input_left_pads,
+                                        std::vector<ck::index_t> input_right_pads) const
+{
+    return pImpl->MakeArgumentPointer(in_ptr,
+                                      wei_ptr,
+                                      out_ptr,
+                                      N,
+                                      K,
+                                      C,
+                                      input_spatial_lengths,
+                                      filter_spatial_lengths,
+                                      output_spatial_lengths,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads);
+}
+
+std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> DeviceConvFwdPtr_t::MakeInvokerPointer() const
+{
+    return pImpl->MakeInvokerPointer();
+}
+
+std::string DeviceConvFwdPtr_t::GetTypeString() { return pImpl->GetTypeString(); }
+bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr)
+{
+    return pImpl->IsSupportedArgument(arg_ptr);
+}
+
+using namespace ck::tensor_operation::device::device_conv2d_fwd_instance;
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp);
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp); // Perhaps we can do better
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp); // Perhaps we can do better
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp); // Perhaps we can do better
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp);
+    }
+    return;
+}
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index 5f057adcc5..556b06d7e1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
    device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
+add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE})
 
 target_compile_features(device_gemm_instance PUBLIC)
 set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
index a0e5ba61a1..e2b0abb1d1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE
    device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias2d_instance PUBLIC)
+add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
 set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_bias2d_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
index 69e05673d6..e2e7d4badd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
    device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) 
-target_compile_features(device_gemm_bias_relu_instance PUBLIC)
+add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) 
 set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_bias_relu_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
index 016bc4be2d..a10dbb555d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
    device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
-target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
+add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
 set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_bias_relu_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
index 8f591d8c49..6c5e31fddd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
    device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) 
+add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) 
 
 target_compile_features(device_grouped_gemm_instance PUBLIC)
 set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
index cced3a4b76..81987ac0d4 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
    device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
 )
 
-add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE}) 
-target_compile_features(device_reduce_instance PUBLIC)
+add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE}) 
 set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_reduce_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_reduce_instance)
diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp
index 7abbf7a042..3393110c33 100644
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -63,7 +63,7 @@ template <typename ADataType,
 bool profile_batched_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                int M,
                                int N,
                                int K,
@@ -356,11 +356,12 @@ bool profile_batched_gemm_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
 
-            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                      sizeof(CDataType) * M * N) *
                                     BatchCount;
 
diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp
index a6399c20d8..bd74dbf459 100644
--- a/profiler/include/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -53,7 +53,7 @@ template <typename ADataType,
 bool profile_batched_gemm_reduce_impl(int do_verification,
                                       int init_method,
                                       bool do_log,
-                                      int nrepeat,
+                                      bool time_kernel,
                                       int M,
                                       int N,
                                       int K,
@@ -259,30 +259,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            // warm up
-            invoker_ptr->Run(argument_ptr.get());
+            // init DO, D1 to 0
+            d0_device_buf.SetZero();
+            d1_device_buf.SetZero();
 
-            // timing
-            float total_time = 0;
-
-            for(int i = 0; i < nrepeat; ++i)
-            {
-                // init DO, D1 to 0
-                d0_device_buf.SetZero();
-                d1_device_buf.SetZero();
-
-                KernelTimer timer;
-
-                timer.Start();
-
-                invoker_ptr->Run(argument_ptr.get());
-
-                timer.End();
-
-                total_time += timer.GetElapsedTime();
-            }
-
-            float ave_time = total_time / nrepeat;
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::string gemm_name = gemm_ptr->GetTypeString();
 
diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp
index bec97e40f5..dfec033737 100644
--- a/profiler/include/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profile_conv_bwd_data_impl.hpp
@@ -51,7 +51,7 @@ template <int NDimSpatial,
 void profile_conv_bwd_data_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
-                                int nrepeat,
+                                bool time_kernel,
                                 ck::index_t N,
                                 ck::index_t K,
                                 ck::index_t C,
@@ -228,7 +228,8 @@ void profile_conv_bwd_data_impl(int do_verification,
         {
             std::string conv_name = conv_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_conv_bwd_weight_impl.hpp b/profiler/include/profile_conv_bwd_weight_impl.hpp
index 20fe0ef549..8e3a4074b0 100644
--- a/profiler/include/profile_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profile_conv_bwd_weight_impl.hpp
@@ -1,4 +1,6 @@
 #pragma once
+
+#include "stream_config.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -43,7 +45,7 @@ template <int NDimSpatial,
 bool profile_conv_bwd_weight_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
-                                  int nrepeat,
+                                  bool time_kernel,
                                   ck::index_t N,
                                   ck::index_t K,
                                   ck::index_t C,
@@ -182,6 +184,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
 
     // profile device Conv instances
     bool pass = true;
+
     for(auto& conv_ptr : conv_ptrs)
     {
         // using atomic, so need to reset input
@@ -189,6 +192,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
         {
             wei_device_buf.SetZero();
         }
+
         auto argument_ptr = conv_ptr->MakeArgumentPointer(
             static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
             static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
@@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification,
         {
             std::string conv_name = conv_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
@@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
                 wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
 
                 float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
+
                 if(max_error > 8)
                 {
                     pass = false;
diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
index d0de7307d2..5ea35cd72f 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
@@ -42,7 +42,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_add_impl(int do_verification,
                                          int init_method,
                                          bool do_log,
-                                         int nrepeat,
+                                         bool time_kernel,
                                          ck::index_t N,
                                          ck::index_t K,
                                          ck::index_t C,
@@ -219,7 +219,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
         {
             std::string conv_name = op_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
index 9bdfa61283..f1c2fd300a 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
@@ -119,7 +119,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
                                                 int init_method,
                                                 bool do_log,
-                                                int nrepeat,
+                                                bool time_kernel,
                                                 ck::index_t N,
                                                 ck::index_t K,
                                                 ck::index_t C,
@@ -275,7 +275,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
         {
             std::string conv_name = op_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
index f34e52048e..eeb2b93e4e 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
@@ -41,7 +41,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
-                                     int nrepeat,
+                                     bool time_kernel,
                                      ck::index_t N,
                                      ck::index_t K,
                                      ck::index_t C,
@@ -207,7 +207,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
         {
             std::string conv_name = op_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp
index 5b1ba71163..291bf2abc0 100644
--- a/profiler/include/profile_convnd_bwd_data_impl.hpp
+++ b/profiler/include/profile_convnd_bwd_data_impl.hpp
@@ -269,7 +269,7 @@ template <int NDimSpatial,
 bool profile_convnd_bwd_data_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
-                                  int nrepeat,
+                                  bool time_kernel,
                                   ck::index_t N,
                                   ck::index_t K,
                                   ck::index_t C,
@@ -410,7 +410,8 @@ bool profile_convnd_bwd_data_impl(int do_verification,
         {
             std::string conv_name = conv_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop =
                 ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp
index 98e4ad76c9..8565f9637c 100644
--- a/profiler/include/profile_gemm_bias_2d_impl.hpp
+++ b/profiler/include/profile_gemm_bias_2d_impl.hpp
@@ -65,7 +65,7 @@ template <typename ADataType,
 void profile_gemm_bias_2d_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                int M,
                                int N,
                                int K,
@@ -259,7 +259,8 @@ void profile_gemm_bias_2d_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_bias_relu_add_impl.hpp b/profiler/include/profile_gemm_bias_relu_add_impl.hpp
index 75ed78075b..6fec17c199 100644
--- a/profiler/include/profile_gemm_bias_relu_add_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_add_impl.hpp
@@ -48,7 +48,7 @@ template <typename ADataType,
 void profile_gemm_bias_relu_add_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
-                                     int nrepeat,
+                                     bool time_kernel,
                                      int M,
                                      int N,
                                      int K,
@@ -232,7 +232,8 @@ void profile_gemm_bias_relu_add_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp
index 0735f3c31b..69010becc5 100644
--- a/profiler/include/profile_gemm_bias_relu_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_impl.hpp
@@ -48,7 +48,7 @@ template <typename ADataType,
 void profile_gemm_bias_relu_impl(int do_verification,
                                  int init_method,
                                  bool do_log,
-                                 int nrepeat,
+                                 bool time_kernel,
                                  int M,
                                  int N,
                                  int K,
@@ -212,7 +212,8 @@ void profile_gemm_bias_relu_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp
index 93262fe802..45e6174260 100644
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -91,7 +91,7 @@ template <typename ADataType,
 void profile_gemm_impl(int do_verification,
                        int init_method,
                        bool do_log,
-                       int nrepeat,
+                       bool time_kernel,
                        int M,
                        int N,
                        int K,
@@ -416,7 +416,8 @@ void profile_gemm_impl(int do_verification,
 
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp
index 6ef3e010b1..d034c9f750 100644
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -52,7 +52,7 @@ template <typename ADataType,
 bool profile_gemm_reduce_impl(int do_verification,
                               int init_method,
                               bool do_log,
-                              int nrepeat,
+                              bool time_kernel,
                               int M,
                               int N,
                               int K,
@@ -243,36 +243,18 @@ bool profile_gemm_reduce_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            // warm up
-            invoker_ptr->Run(argument_ptr.get());
+            // init DO, D1 to 0
+            d0_device_buf.SetZero();
+            d1_device_buf.SetZero();
 
-            // timing
-            float total_time = 0;
-
-            for(int i = 0; i < nrepeat; ++i)
-            {
-                // init DO, D1 to 0
-                d0_device_buf.SetZero();
-                d1_device_buf.SetZero();
-
-                KernelTimer timer;
-
-                timer.Start();
-
-                invoker_ptr->Run(argument_ptr.get());
-
-                timer.End();
-
-                total_time += timer.GetElapsedTime();
-            }
-
-            float ave_time = total_time / nrepeat;
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::string gemm_name = gemm_ptr->GetTypeString();
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
-            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                     sizeof(CDataType) * M * N + sizeof(CDataType) * N;
 
             float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp
index ae70f551f1..96d34c7e42 100644
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -49,7 +49,7 @@ template <typename ADataType,
 void profile_grouped_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                const std::vector<int>& Ms,
                                const std::vector<int>& Ns,
                                const std::vector<int>& Ks,
@@ -231,7 +231,8 @@ void profile_grouped_gemm_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = 0, num_btype = 0;
             for(std::size_t i = 0; i < gemm_shapes.size(); i++)
diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp
index 678134f60b..33c7929ddd 100644
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -157,7 +157,7 @@ void profile_reduce_impl_impl(bool do_verification,
                               int init_method,
                               bool do_log,
                               bool do_dumpout,
-                              int nrepeat,
+                              bool time_kernel,
                               const std::vector<size_t>& inLengths,
                               const std::vector<int>& reduceDims,
                               float alpha,
@@ -430,7 +430,8 @@ void profile_reduce_impl_impl(bool do_verification,
 
             auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
 
-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t num_bytes =
                 invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -516,7 +517,8 @@ void profile_reduce_impl_impl(bool do_verification,
 
             auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
 
-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t num_bytes =
                 invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -554,7 +556,8 @@ void profile_reduce_impl_impl(bool do_verification,
 
                 auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
 
-                float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat);
+                float avg_time_2 =
+                    invoker2_ptr->Run(argument2_ptr.get(), StreamConfig{nullptr, time_kernel});
 
                 std::size_t num_bytes_2 =
                     static_cast<size_t>(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType);
@@ -625,7 +628,7 @@ void profile_reduce_impl(bool do_verification,
                          int init_method,
                          bool do_log,
                          bool do_dumpout,
-                         int nrepeat,
+                         bool time_kernel,
                          const std::vector<size_t>& inLengths,
                          const std::vector<int>& reduceDims,
                          ReduceTensorOp ReduceOpId,
@@ -663,7 +666,7 @@ void profile_reduce_impl(bool do_verification,
             init_method,
             do_log,
             do_dumpout,
-            nrepeat,
+            time_kernel,
             inLengths,
             reduceDims,
             alpha,
diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp
index 2a806b0818..db5486e0ac 100644
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[])
         printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
         exit(1);
     }
@@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp
index 38c3f52193..f67e561865 100644
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
         printf("arg15: split k into  mulitiple batch\n");
         exit(1);
@@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp
index 2861af3d10..206d486ea0 100644
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
@@ -44,7 +44,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=n0, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -57,7 +57,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -96,7 +96,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            StreamControl{nullptr, time_kernel},
             N,
             K,
             C,
@@ -122,7 +122,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            StreamControl{nullptr, time_kernel},
             N,
             K,
             C,
@@ -148,7 +148,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            StreamControl{nullptr, time_kernel},
             N,
             K,
             C,
@@ -174,7 +174,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            StreamControl{nullptr, time_kernel},
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_bwd_weight.cpp b/profiler/src/profile_conv_bwd_weight.cpp
index 309cc8ea2c..c022d19ee0 100644
--- a/profiler/src/profile_conv_bwd_weight.cpp
+++ b/profiler/src/profile_conv_bwd_weight.cpp
@@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
@@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp
index 1c447b483e..28aa49687f 100644
--- a/profiler/src/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu.cpp
@@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=n0, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
index 522487c77b..7e033a51e2 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=n0, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
index 833f2851db..095536f701 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=n0, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_convnd_bwd_data.cpp b/profiler/src/profile_convnd_bwd_data.cpp
index 4d6b9a7b37..5d0e6a34c7 100644
--- a/profiler/src/profile_convnd_bwd_data.cpp
+++ b/profiler/src/profile_convnd_bwd_data.cpp
@@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=n0, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         return 1;
@@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);
 
@@ -132,7 +132,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                 do_verification,
                 init_method,
                 do_log,
-                nrepeat,
+                time_kernel,
                 params.N_,
                 params.K_,
                 params.C_,
@@ -157,7 +157,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                 do_verification,
                 init_method,
                 do_log,
-                nrepeat,
+                time_kernel,
                 params.N_,
                 params.K_,
                 params.C_,
@@ -182,7 +182,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                 do_verification,
                 init_method,
                 do_log,
-                nrepeat,
+                time_kernel,
                 params.N_,
                 params.K_,
                 params.C_,
diff --git a/profiler/src/profile_convnd_fwd.cpp b/profiler/src/profile_convnd_fwd.cpp
index 7902cdb002..722e86c2ea 100644
--- a/profiler/src/profile_convnd_fwd.cpp
+++ b/profiler/src/profile_convnd_fwd.cpp
@@ -119,7 +119,7 @@ template <int NDim,
 void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
                                    bool do_verification,
                                    bool do_log,
-                                   int nrepeat,
+                                   bool time_kernel,
                                    int init_method,
                                    ConvLayouts)
 {
@@ -185,7 +185,7 @@ void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
                                                                          reference_conv_fwd_fun);
     auto best_conf = run_engine.Profile(
         conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
-        nrepeat,
+        time_kernel,
         do_verification,
         do_log);
 
@@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type,
                               const ck::utils::conv::ConvParams& params,
                               bool do_verification,
                               bool do_log,
-                              int nrepeat,
+                              bool time_kernel,
                               int init_method)
 {
     switch(data_layout)
@@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
     bool do_verification{true};
     int init_method{2};
     bool do_log{false};
-    int nrepeat{100};
+    bool time_kernel{false};
     int num_dim_spatial{2};
     ConvParams params;
 
@@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
         do_verification = std::stoi(argv[4]);
         init_method     = std::stoi(argv[5]);
         do_log          = std::stoi(argv[6]);
-        nrepeat         = std::stoi(argv[7]);
+        time_kernel     = std::stoi(argv[7]);
         num_dim_spatial = std::stoi(argv[8]);
     }
     if(argc >= 10)
@@ -332,15 +332,15 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
     {
     case 1:
         profile_convnd_instances<1>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     case 2:
         profile_convnd_instances<2>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     case 3:
         profile_convnd_instances<3>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     default:
         throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index 7a72be2d8e..4c6a3b0487 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: split k into  mulitiple batch\n");
         exit(1);
@@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -74,7 +74,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -94,7 +94,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -114,7 +114,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -134,7 +134,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -154,7 +154,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -174,7 +174,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -194,7 +194,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -214,7 +214,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -234,7 +234,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -254,7 +254,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -274,7 +274,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -294,7 +294,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -314,7 +314,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -334,7 +334,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -354,7 +354,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -374,7 +374,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_bias_2d.cpp b/profiler/src/profile_gemm_bias_2d.cpp
index dd7e418087..46d4f90c17 100644
--- a/profiler/src/profile_gemm_bias_2d.cpp
+++ b/profiler/src/profile_gemm_bias_2d.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: alpha\n");
         printf("arg15: beta\n");
@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_bias_relu.cpp b/profiler/src/profile_gemm_bias_relu.cpp
index 67a47cf9ec..4346650c9f 100644
--- a/profiler/src/profile_gemm_bias_relu.cpp
+++ b/profiler/src/profile_gemm_bias_relu.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: split k into  mulitiple batch\n");
         exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_bias_relu_add.cpp b/profiler/src/profile_gemm_bias_relu_add.cpp
index 52406e93d6..186f32cf6f 100644
--- a/profiler/src/profile_gemm_bias_relu_add.cpp
+++ b/profiler/src/profile_gemm_bias_relu_add.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
         printf("arg15: split k into  mulitiple batch\n");
         exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp
index a83d4ce9a1..986acaf010 100644
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: split k into  mulitiple batch\n");
         exit(1);
@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 88a2a8f855..d35484cfae 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
         printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
                "64,64 64,64 128,128)\n");
         exit(1);
@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const auto Ms = argToIntArray(argv[8]);
     const auto Ns = argToIntArray(argv[9]);
@@ -86,7 +86,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
@@ -104,7 +104,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
@@ -122,7 +122,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
@@ -140,7 +140,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp
index 96fa78964a..5e91a1d2d1 100644
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -144,7 +144,7 @@ class AppArgs
     bool do_dumpout                = false;
 
     int init_method;
-    int nrepeat;
+    bool time_kernel;
 
     bool need_indices = false;
 
@@ -295,7 +295,7 @@ class AppArgs
             throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
 
         init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = std::atoi(argv[optind]);
 
         if(scales.empty())
         {
@@ -354,7 +354,7 @@ int profile_reduce(int argc, char* argv[])
                                                                     args.init_method,
                                                                     args.do_log,
                                                                     args.do_dumpout,
-                                                                    args.nrepeat,
+                                                                    args.time_kernel,
                                                                     args.inLengths,
                                                                     args.reduceDims,
                                                                     args.reduceOp,
@@ -369,7 +369,7 @@ int profile_reduce(int argc, char* argv[])
                                                                args.init_method,
                                                                args.do_log,
                                                                args.do_dumpout,
-                                                               args.nrepeat,
+                                                               args.time_kernel,
                                                                args.inLengths,
                                                                args.reduceDims,
                                                                args.reduceOp,
@@ -387,7 +387,7 @@ int profile_reduce(int argc, char* argv[])
                                                     args.init_method,
                                                     args.do_log,
                                                     args.do_dumpout,
-                                                    args.nrepeat,
+                                                    args.time_kernel,
                                                     args.inLengths,
                                                     args.reduceDims,
                                                     args.reduceOp,
@@ -414,7 +414,7 @@ int profile_reduce(int argc, char* argv[])
                                                         args.init_method,
                                                         args.do_log,
                                                         args.do_dumpout,
-                                                        args.nrepeat,
+                                                        args.time_kernel,
                                                         args.inLengths,
                                                         args.reduceDims,
                                                         args.reduceOp,
@@ -429,7 +429,7 @@ int profile_reduce(int argc, char* argv[])
                                                          args.init_method,
                                                          args.do_log,
                                                          args.do_dumpout,
-                                                         args.nrepeat,
+                                                         args.time_kernel,
                                                          args.inLengths,
                                                          args.reduceDims,
                                                          args.reduceOp,
@@ -454,7 +454,7 @@ int profile_reduce(int argc, char* argv[])
                                                              args.init_method,
                                                              args.do_log,
                                                              args.do_dumpout,
-                                                             args.nrepeat,
+                                                             args.time_kernel,
                                                              args.inLengths,
                                                              args.reduceDims,
                                                              args.reduceOp,
@@ -471,7 +471,7 @@ int profile_reduce(int argc, char* argv[])
                                                      args.init_method,
                                                      args.do_log,
                                                      args.do_dumpout,
-                                                     args.nrepeat,
+                                                     args.time_kernel,
                                                      args.inLengths,
                                                      args.reduceDims,
                                                      args.reduceOp,
@@ -486,7 +486,7 @@ int profile_reduce(int argc, char* argv[])
                                                       args.init_method,
                                                       args.do_log,
                                                       args.do_dumpout,
-                                                      args.nrepeat,
+                                                      args.time_kernel,
                                                       args.inLengths,
                                                       args.reduceDims,
                                                       args.reduceOp,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8a9db2adbd..c696069393 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -22,6 +22,8 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/external/include/half
 )
 
+include(googletest)
+
 add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
 add_custom_target(tests)
 
@@ -61,4 +63,5 @@ add_subdirectory(grouped_gemm)
 add_subdirectory(convnd_fwd)
 add_subdirectory(reduce)
 add_subdirectory(conv2d_bwd_weight)
-add_subdirectory(convnd_bwd_data)
\ No newline at end of file
+add_subdirectory(convnd_bwd_data)
+# DONOT add client_app, that is tested via CI independently
\ No newline at end of file
diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
index ce061c644b..7b311cff17 100644
--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -22,7 +22,7 @@ int main()
                                                                   Row,
                                                                   Row,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, K, N, N, BatchCount);
+                       true, 1, false, false, M, N, K, K, N, N, BatchCount);
 
     pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                   ck::half_t,
@@ -31,7 +31,7 @@ int main()
                                                                   Row,
                                                                   Col,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, K, K, N, BatchCount);
+                       true, 1, false, false, M, N, K, K, K, N, BatchCount);
 
     pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                   ck::half_t,
@@ -40,7 +40,7 @@ int main()
                                                                   Col,
                                                                   Row,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, M, N, N, BatchCount);
+                       true, 1, false, false, M, N, K, M, N, N, BatchCount);
 
     pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                   ck::half_t,
@@ -49,7 +49,7 @@ int main()
                                                                   Col,
                                                                   Col,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, M, K, N, BatchCount);
+                       true, 1, false, false, M, N, K, M, K, N, BatchCount);
 
     if(pass)
     {
diff --git a/test/client_app/CMakeLists.txt b/test/client_app/CMakeLists.txt
new file mode 100644
index 0000000000..f8dd8c4e0a
--- /dev/null
+++ b/test/client_app/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.15)
+project(ck_app)
+add_compile_options(-std=c++14)
+
+find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor)
+find_package(hip REQUIRED PATHS /opt/rocm)
+message(STATUS "Build with HIP ${hip_VERSION}")
+
+add_executable(test_client_app client_app.cpp)
+
+target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host)
diff --git a/test/client_app/client_app.cpp b/test/client_app/client_app.cpp
new file mode 100644
index 0000000000..665a103f70
--- /dev/null
+++ b/test/client_app/client_app.cpp
@@ -0,0 +1,77 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include <vector>
+
+#include "client_app_impl.hpp"
+
+int main(int argc, char* argv[])
+{
+    if(argc != 25)
+    {
+        printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
+        printf("arg9: time kernel (0=n0, 1=yes)\n");
+        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(1);
+    }
+
+    const ConvDataType data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const int in_layout          = static_cast<ConvInputLayout>(std::stoi(argv[3]));
+    const int wei_layout         = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
+    const int out_layout         = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
+    const bool do_verification   = std::stoi(argv[6]);
+    const int init_method        = std::stoi(argv[7]);
+    const bool do_log            = std::stoi(argv[8]);
+    const bool time_kernel       = std::stoi(argv[9]);
+
+    const ck::index_t N  = std::stoi(argv[10]);
+    const ck::index_t K  = std::stoi(argv[11]);
+    const ck::index_t C  = std::stoi(argv[12]);
+    const ck::index_t Y  = std::stoi(argv[13]);
+    const ck::index_t X  = std::stoi(argv[14]);
+    const ck::index_t Hi = std::stoi(argv[15]);
+    const ck::index_t Wi = std::stoi(argv[16]);
+
+    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
+    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
+    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
+    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
+    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
+    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
+
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    ck::app::profile_conv_fwd_impl(do_verification,
+                                   init_method,
+                                   do_log,
+                                   time_kernel,
+                                   data_type,
+                                   N,
+                                   K,
+                                   C,
+                                   std::vector<ck::index_t>{Hi, Wi},
+                                   std::vector<ck::index_t>{Y, X},
+                                   std::vector<ck::index_t>{Ho, Wo},
+                                   std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
+                                   std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
+                                   std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
+                                   std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
+    return 1;
+}
diff --git a/test/client_app/client_app_impl.hpp b/test/client_app/client_app_impl.hpp
new file mode 100644
index 0000000000..f9e4145ba0
--- /dev/null
+++ b/test/client_app/client_app_impl.hpp
@@ -0,0 +1,214 @@
+#pragma once
+
+#include "host_interface.hpp"
+
+enum ConvDataType
+{
+    F32_F32_F32,    // 0
+    F16_F16_F16,    // 1
+    BF16_BF16_BF16, // 2
+    INT8_INT8_INT8, // 3
+};
+
+enum ConvInputLayout
+{
+    NCHW, // 0
+    NHWC, // 1
+};
+
+enum ConvWeightLayout
+{
+    KCYX, // 0
+    KYXC, // 1
+};
+
+enum ConvOutputLayout
+{
+    NKHW, // 0
+    NHWK, // 1
+};
+
+void check_hip_error(void)
+{
+    hipError_t err = hipGetLastError();
+    if(err != hipSuccess)
+    {
+        std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
+        exit(err);
+    }
+}
+std::string getDeviceName(int device)
+{
+    struct hipDeviceProp_t prop;
+    hipGetDeviceProperties(&prop, device);
+    check_hip_error();
+    return std::string(prop.name);
+}
+
+int getDriver(void)
+{
+    int driver;
+    hipDriverGetVersion(&driver);
+    check_hip_error();
+    return driver;
+}
+
+namespace ck {
+namespace app {
+struct DeviceMem
+{
+    DeviceMem() = delete;
+    DeviceMem(std::size_t mem_size);
+    void* GetDeviceBuffer();
+    void ToDevice(const void* p);
+    void FromDevice(void* p);
+    ~DeviceMem();
+
+    void* mpDeviceBuf;
+    std::size_t mMemSize;
+};
+
+DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
+{
+    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+}
+
+void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
+
+void DeviceMem::ToDevice(const void* p)
+{
+    hipGetErrorString(
+        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
+}
+
+void DeviceMem::FromDevice(void* p)
+{
+    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
+}
+
+DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
+
+void profile_conv_fwd_impl(int do_verification,
+                           int init_method,
+                           bool do_log,
+                           bool time_kernel,
+                           ConvDataType data_type,
+                           ck::index_t N,
+                           ck::index_t K,
+                           ck::index_t C,
+                           std::vector<ck::index_t> input_spatial_lengths,
+                           std::vector<ck::index_t> filter_spatial_lengths,
+                           std::vector<ck::index_t> output_spatial_lengths,
+                           std::vector<ck::index_t> conv_filter_strides,
+                           std::vector<ck::index_t> conv_filter_dilations,
+                           std::vector<ck::index_t> input_left_pads,
+                           std::vector<ck::index_t> input_right_pads)
+{
+    const ck::index_t Y = filter_spatial_lengths[0];
+    const ck::index_t X = filter_spatial_lengths[1];
+
+    const ck::index_t Hi = input_spatial_lengths[0];
+    const ck::index_t Wi = input_spatial_lengths[1];
+
+    const ck::index_t Ho = output_spatial_lengths[0];
+    const ck::index_t Wo = output_spatial_lengths[1];
+
+    const auto in_sz  = N * C * Hi * Wi;
+    const auto wei_sz = K * C * Y * X;
+    const auto out_sz = N * K * Ho * Wo;
+
+    using WeiDataType = float;
+    using InDataType  = float;
+    using OutDataType = float;
+
+    app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz);
+    app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz);
+    app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz);
+    // data is already on device!
+
+    // add device Conv instances
+    std::vector<DeviceConvFwdPtr_t> conv_ptrs;
+    if(data_type == F16_F16_F16)
+    {
+        add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
+    }
+    else if(data_type == BF16_BF16_BF16)
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs);
+    else if(data_type == F32_F32_F32)
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs);
+    else if(data_type == INT8_INT8_INT8)
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs);
+    else
+        throw std::runtime_error("wrong! Invalid data type");
+    if(conv_ptrs.empty())
+    {
+        throw std::runtime_error("wrong! no device Conv instance found");
+    }
+
+    std::string best_conv_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    int deviceIndex       = 0;
+    hipSetDevice(deviceIndex);
+    check_hip_error();
+
+    StreamConfig stream_config{nullptr, time_kernel};
+    hipStreamCreate(&stream_config.stream_id_);
+    check_hip_error();
+
+    // profile device Conv instances
+    for(auto& conv_ptr : conv_ptrs)
+    {
+        auto argument_ptr =
+            conv_ptr.MakeArgumentPointer(static_cast<void*>(in_device_buf.GetDeviceBuffer()),
+                                         static_cast<void*>(wei_device_buf.GetDeviceBuffer()),
+                                         static_cast<void*>(out_device_buf.GetDeviceBuffer()),
+                                         N,
+                                         K,
+                                         C,
+                                         input_spatial_lengths,
+                                         filter_spatial_lengths,
+                                         output_spatial_lengths,
+                                         conv_filter_strides,
+                                         conv_filter_dilations,
+                                         input_left_pads,
+                                         input_right_pads);
+
+        auto invoker_ptr = conv_ptr.MakeInvokerPointer();
+
+        if(conv_ptr.IsSupportedArgument(argument_ptr.get()))
+        {
+            std::string conv_name = conv_ptr.GetTypeString();
+            float ave_time        = invoker_ptr->Run(argument_ptr.get(), stream_config);
+
+            std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+
+            std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
+                                    sizeof(WeiDataType) * (K * C * Y * X) +
+                                    sizeof(OutDataType) * (N * K * Ho * Wo);
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << conv_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_conv_name  = conv_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
+}
+
+} // namespace app
+} // namespace ck
diff --git a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp
index 085473f695..671980f49e 100644
--- a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp
+++ b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp
@@ -28,10 +28,10 @@ int test_self()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -52,10 +52,10 @@ int test_self()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -72,8 +72,8 @@ int test_self()
 }
 int main(int argc, char* argv[])
 {
-    int data_type   = 0;
-    int init_method = 0;
+    int data_type   = 1;
+    int init_method = 1;
 
     // Conv shape
     ck::index_t N               = 128;
@@ -155,10 +155,10 @@ int main(int argc, char* argv[])
                                                               ck::tensor_layout::convolution::NHWC,
                                                               ck::tensor_layout::convolution::KYXC,
                                                               ck::tensor_layout::convolution::NHWK>(
-                1,
+                true, // do_verification
                 init_method,
-                0,
-                1,
+                false, // do_log
+                false, // time_kernel
                 param.N_,
                 param.K_,
                 param.C_,
@@ -180,10 +180,10 @@ int main(int argc, char* argv[])
                                                               ck::tensor_layout::convolution::NHWC,
                                                               ck::tensor_layout::convolution::KYXC,
                                                               ck::tensor_layout::convolution::NHWK>(
-                1,
+                true, // do_verification
                 init_method,
-                0,
-                1,
+                false, // do_log
+                false, // time_kernel
                 param.N_,
                 param.K_,
                 param.C_,
diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp
index 0b6ddb1405..7284680e0e 100644
--- a/test/convnd_bwd_data/convnd_bwd_data.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data.cpp
@@ -27,10 +27,10 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -50,10 +50,10 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -73,10 +73,10 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -96,10 +96,10 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -128,10 +128,10 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -151,10 +151,10 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -174,10 +174,10 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -197,10 +197,10 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -232,10 +232,10 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -255,10 +255,10 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -278,10 +278,10 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
@@ -301,10 +301,10 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
             param.N_,
             param.K_,
             param.C_,
diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/test/gemm_reduce/gemm_reduce_fp16.cpp
index 8deb66b2b0..6c7bb9658f 100644
--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -16,22 +16,22 @@ int main()
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Row, Row>(
-                   true, 1, false, 1, M, N, K, K, N, N);
+                   true, 1, false, false, M, N, K, K, N, N);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Col, Row>(
-                   true, 1, false, 1, M, N, K, K, K, N);
+                   true, 1, false, false, M, N, K, K, K, N);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Row, Row>(
-                   true, 1, false, 1, M, N, K, M, N, N);
+                   true, 1, false, false, M, N, K, M, N, N);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Col, Row>(
-                   true, 1, false, 1, M, N, K, M, K, N);
+                   true, 1, false, false, M, N, K, M, K, N);
 
     if(pass)
     {
diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp
index c788b66aa3..b63361aa1b 100644
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -187,9 +187,10 @@ int test_gemm(const gemmArgs& args)
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            invoker_ptr->Run(argument_ptr.get(), 0);
+            invoker_ptr->Run(argument_ptr.get());
 
             c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
             if(!check_out(c_m_n_host_result, c_m_n_device_result))
             {
                 success = false;