diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b798e38f3..a3ec91e3bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
 
+option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
+
 ## OpenMP
 if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	# workaround issue hipcc in rocm3.5 cannot find openmp
@@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}")
 
 
 rocm_create_package(
-    NAME CK-${CK_BACKEND}
+    NAME composablekernel
     DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+    MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
     LDCONFIG
 )
 
@@ -226,7 +229,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
 
-configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
+configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp")
 
 include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/include
@@ -234,7 +237,6 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/library/include
 )
 
-include(googletest)
 
 SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
 if(BUILD_DEV)
@@ -243,7 +245,31 @@ if(BUILD_DEV)
 endif()
 message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
 
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+
 add_subdirectory(library)
 add_subdirectory(example)
 add_subdirectory(test)
 add_subdirectory(profiler)
+
+# Generate and install the CMake package config and version files so that client projects can locate this build with find_package(composable_kernel)
+include(CMakePackageConfigHelpers)
+
+set(version 1.0.0)
+write_basic_package_version_file(
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    VERSION "${version}"
+    COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+        "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+        INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+install(FILES
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
diff --git a/Config.cmake.in b/Config.cmake.in
new file mode 100644
index 0000000000..12b5c331ae
--- /dev/null
+++ b/Config.cmake.in
@@ -0,0 +1,16 @@
+@PACKAGE_INIT@
+
+# Components that may be requested via find_package(composable_kernel COMPONENTS ...).
+set(_composable_kernel_supported_components device_operations host_tensor)
+
+# Load the Targets file of every requested component. An unknown component
+# marks the package as not found and is skipped, so that find_package() can
+# report the message below instead of aborting on a missing Targets file.
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+	if(_comp IN_LIST _composable_kernel_supported_components)
+		include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+	else()
+		set(composable_kernel_FOUND False)
+		set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+	endif()
+endforeach()
diff --git a/Dockerfile b/Dockerfile
index c4cf0fac57..9a443e01de 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
 RUN apt-get update
 RUN apt-get install -y wget gnupg
 RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN if ! [ -z $OSDB_BKC_VERSION ]; then \
-       echo "Using BKC VERISION: $OSDB_BKC_VERSION";\
-       sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\
-       cat  /etc/apt/sources.list.d/rocm.list;\
-    else \
-       sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\
-    fi
+RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
 RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
 
@@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
     apt-utils \
-    sshpass \
     build-essential \
     cmake-data=3.15.1-0kitware1 \
     cmake=3.15.1-0kitware1 \
     curl \
-    doxygen \
     g++ \
     gdb \
     git \
     hip-rocclr \
     jq \
-    lcov \
     libelf-dev \
     libncurses5-dev \
     libnuma-dev \
@@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# RUN pip3 install --default-timeout=100000 -r requirements.txt
-
 # Setup ubsan environment to printstacktrace
 RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
 ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini
 ADD dev-requirements.txt dev-requirements.txt
 RUN rbuild prepare -s develop -d $PREFIX
 RUN groupadd -f render
-# RUN cget install -f min-requirements.txt
-# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt
diff --git a/Jenkinsfile b/Jenkinsfile
index f065d4ecc5..77f4d9d8be 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -320,7 +320,7 @@ pipeline {
                 {
                     agent{ label rocmnode("gfx908")}
                     environment{
-                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
+                        setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906  --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
@@ -341,6 +341,23 @@ pipeline {
 
             }
         }
+        stage("Client App")
+        {
+            parallel
+            {
+                stage("Run Client App")
+                {
+                    agent{ label rocmnode("gfx908")}
+                    environment{
+                        setup_args = """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """ // every cache entry needs its own -D
+                        execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make  """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                    }
+                }
+            }
+        }
         stage("Performance Tests")
         {
             parallel
diff --git a/README.md b/README.md
index f5341b5736..9d7b578046 100644
--- a/README.md
+++ b/README.md
@@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/```
  make -j ckProfiler
 ```
 Instructions for running ckProfiler are under ```profiler/```
+
+
+## Caveat
+### Kernel Timing and Verification
+CK's own kernel timer will warm up the kernel once, and then run it multiple times
+to get the average kernel time. For some kernels that use atomic add, this will cause
+the output buffer to be accumulated multiple times, causing verification failure.
+To work around it, do not use CK's own timer and do verification at the same time.
+CK's own timer and verification in each example and ckProfiler can be enabled or
+disabled from the command line.
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index c7e70cc8a9..959bc4f4b0 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -18,6 +18,8 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
      -Wno-switch-enum
      -Wno-zero-as-null-pointer-constant
      -Wno-unused-member-function
+     -Wno-comma
+     -Wno-old-style-cast
 )
 message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
 
@@ -33,4 +35,5 @@ FetchContent_MakeAvailable(googletest)
 
 target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
 target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
-
+target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
+target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index a4567dcd6e..060750e676 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -198,7 +198,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
@@ -232,7 +232,7 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData) ? 0 : 1;
     }
 
     return 0;
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index fc04a13ca5..06523037f9 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -73,13 +73,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -171,7 +171,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
     }
 
     return 0;
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index ab5869db61..a22c21e40e 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -194,7 +194,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
     }
 
     return 0;
diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
index 2abebbbac4..1a6e1de4dc 100644
--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
@@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D<AD
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -106,13 +106,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 6)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         alpha = std::stof(argv[4]);
         beta  = std::stof(argv[5]);
@@ -121,7 +121,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -138,7 +138,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
         exit(0);
     }
@@ -216,7 +216,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
@@ -246,6 +246,8 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
     }
+
+    return 0;
 }
diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
index f3ed2bad37..3bf3003c14 100644
--- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
+++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActiv
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -100,13 +100,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -120,7 +120,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -206,7 +206,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
 
@@ -232,6 +232,8 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
     }
+
+    return 0;
 }
diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
index 9405c36881..73e92f9d11 100644
--- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
+++ b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
@@ -83,9 +83,9 @@ using ReferenceGemmInstance =
                                                                CElementOp>;
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -101,13 +101,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 11)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -122,7 +122,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n");
         exit(0);
     }
@@ -218,7 +218,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = std::size_t(2) * M * N * K;
     std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
@@ -250,6 +250,8 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
     }
+
+    return 0;
 }
diff --git a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
index df8f70606c..4e1dd1f3e6 100644
--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index 751ce16b90..d50afb6854 100644
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -7,7 +7,7 @@
 
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
 #include "device_tensor.hpp"
@@ -93,7 +93,7 @@ void PrintUseMsg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "Following arguments:\n"
               << " N, K, C, \n"
               << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
@@ -120,40 +120,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
     ck::utils::conv::ConvParams params;
     int arg_idx = 4;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification      = 0;
-    int init_method           = 0;
-    int nrepeat               = 5;
+    bool do_verification      = true;
+    int init_method           = 1;
+    bool time_kernel          = false;
     const int num_dim_spatial = 2;
 
     ck::utils::conv::ConvParams params;
@@ -176,7 +176,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
 
     if(argc >= 5)
@@ -184,21 +184,21 @@ int main(int argc, char* argv[])
         params = ParseConvParams(argc, argv);
     }
 
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -211,7 +211,7 @@ int main(int argc, char* argv[])
         get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
     // bias: assume contiguous 1d vector
     Tensor<OutDataType> bias(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K)})));
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
 
     std::cout << "input: " << input.mDesc << std::endl;
     std::cout << "weights: " << weights.mDesc << std::endl;
@@ -248,16 +248,16 @@ int main(int argc, char* argv[])
                           static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                           static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                           static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
-                          params.N,
-                          params.K,
-                          params.C,
-                          params.input_spatial_lengths,
-                          params.filter_spatial_lengths,
+                          params.N_,
+                          params.K_,
+                          params.C_,
+                          params.input_spatial_lengths_,
+                          params.filter_spatial_lengths_,
                           output_spatial_lengths,
-                          params.conv_filter_strides,
-                          params.conv_filter_dilations,
-                          params.input_left_pads,
-                          params.input_right_pads,
+                          params.conv_filter_strides_,
+                          params.conv_filter_dilations_,
+                          params.input_left_pads_,
+                          params.input_right_pads_,
                           InElementOp{},
                           WeiElementOp{},
                           OutElementOp{});
@@ -269,18 +269,18 @@ int main(int argc, char* argv[])
             "not support this problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
-                                                        params.C,
-                                                        params.K,
-                                                        params.input_spatial_lengths,
-                                                        params.filter_spatial_lengths,
+        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
+                                                        params.C_,
+                                                        params.K_,
+                                                        params.input_spatial_lengths_,
+                                                        params.filter_spatial_lengths_,
                                                         output_spatial_lengths) +
-        sizeof(OutDataType) * (params.K);
+        sizeof(OutDataType) * (params.K_);
 
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -296,16 +296,17 @@ int main(int argc, char* argv[])
                                                   weights,
                                                   host_output,
                                                   bias,
-                                                  params.conv_filter_strides,
-                                                  params.conv_filter_dilations,
-                                                  params.input_left_pads,
-                                                  params.input_right_pads,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
                                                   InElementOp{},
                                                   WeiElementOp{},
                                                   OutElementOp{});
         ref_invoker.Run(ref_argument);
         out_device_buf.FromDevice(device_output.mData.data());
-        ck::utils::check_err(
-            host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+        return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
     }
+
+    return 0;
 }
diff --git a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
index 8bc5980025..b4dd39d83a 100644
--- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
@@ -1,2 +1,3 @@
-add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util)
+# FIXME: should fix validation failure
+add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
diff --git a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
index e6339fcd23..53d882778a 100644
--- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -7,7 +7,7 @@
 
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
 #include "device_tensor.hpp"
@@ -90,7 +90,7 @@ void PrintUseMsg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "Following arguments:\n"
               << " N, K, C, \n"
               << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
@@ -117,40 +117,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
     ck::utils::conv::ConvParams params;
     int arg_idx = 4;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -162,9 +162,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification      = 0;
-    int init_method           = 0;
-    int nrepeat               = 5;
+    bool do_verification      = true;
+    int init_method           = 1;
+    bool time_kernel          = false;
     const int num_dim_spatial = 2;
 
     ck::utils::conv::ConvParams params;
@@ -173,7 +173,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
 
     if(argc >= 5)
@@ -181,21 +181,21 @@ int main(int argc, char* argv[])
         params = ParseConvParams(argc, argv);
     }
 
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -209,7 +209,7 @@ int main(int argc, char* argv[])
 
     // bias: assume contiguous 1d vector
     Tensor<OutDataType> bias(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K)})));
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
 
     // residual: assume same layout as output tensor
     Tensor<OutDataType> residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
@@ -259,16 +259,16 @@ int main(int argc, char* argv[])
                           static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                           static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
                           static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
-                          params.N,
-                          params.K,
-                          params.C,
-                          params.input_spatial_lengths,
-                          params.filter_spatial_lengths,
+                          params.N_,
+                          params.K_,
+                          params.C_,
+                          params.input_spatial_lengths_,
+                          params.filter_spatial_lengths_,
                           output_spatial_lengths,
-                          params.conv_filter_strides,
-                          params.conv_filter_dilations,
-                          params.input_left_pads,
-                          params.input_right_pads,
+                          params.conv_filter_strides_,
+                          params.conv_filter_dilations_,
+                          params.input_left_pads_,
+                          params.input_right_pads_,
                           in_element_op,
                           wei_element_op,
                           out_element_op);
@@ -280,20 +280,20 @@ int main(int argc, char* argv[])
             "not support this problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
-                                                        params.C,
-                                                        params.K,
-                                                        params.input_spatial_lengths,
-                                                        params.filter_spatial_lengths,
+        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
+                                                        params.C_,
+                                                        params.K_,
+                                                        params.input_spatial_lengths_,
+                                                        params.filter_spatial_lengths_,
                                                         output_spatial_lengths) +
-        sizeof(OutDataType) * (params.K) +
+        sizeof(OutDataType) * (params.K_) +
         sizeof(OutDataType) *
-            (params.N * params.K * output_spatial_lengths[0] * output_spatial_lengths[1]);
+            (params.N_ * params.K_ * output_spatial_lengths[0] * output_spatial_lengths[1]);
 
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -310,17 +310,18 @@ int main(int argc, char* argv[])
                                                   host_output,
                                                   bias,
                                                   residual,
-                                                  params.conv_filter_strides,
-                                                  params.conv_filter_dilations,
-                                                  params.input_left_pads,
-                                                  params.input_right_pads,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
                                                   in_element_op,
                                                   wei_element_op,
                                                   out_element_op);
 
         ref_invoker.Run(ref_argument);
         out_device_buf.FromDevice(device_output.mData.data());
-        ck::utils::check_err(
-            host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+        return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
     }
+
+    return 0;
 }
diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt
index f602862a04..ceceb4aedc 100644
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -1,6 +1,6 @@
-add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
-target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util)
+add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
-target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util)
 add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
-target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util)
+target_link_libraries(example_convnd_fwd_xdl_fp32 PRIVATE conv_util)
+target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_util)
+target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_util)
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
index eaa5683978..7ad83d5ad6 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -5,7 +5,7 @@
 
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_tensor.hpp"
 #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -43,10 +43,10 @@ template <ck::index_t NumDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::
     DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
         // clang-format off
-        InDataType,         // 
+        InDataType,         //
         WeiDataType,        //
         OutDataType,        //
-        AccDataType,        // 
+        AccDataType,        //
         InElementOp,        // Input Elementwise Operation
         WeiElementOp,       // Weights Elementwise Operation
         OutElementOp,       // Output Elementwise Operation
@@ -110,7 +110,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -137,40 +137,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
     ck::utils::conv::ConvParams params;
     int arg_idx = 5;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -182,9 +182,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
@@ -193,7 +193,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
 
@@ -202,21 +202,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
 
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -256,16 +256,16 @@ int main(int argc, char* argv[])
         conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                  params.N,
-                                  params.K,
-                                  params.C,
-                                  params.input_spatial_lengths,
-                                  params.filter_spatial_lengths,
+                                  params.N_,
+                                  params.K_,
+                                  params.C_,
+                                  params.input_spatial_lengths_,
+                                  params.filter_spatial_lengths_,
                                   output_spatial_lengths,
-                                  params.conv_filter_strides,
-                                  params.conv_filter_dilations,
-                                  params.input_left_pads,
-                                  params.input_right_pads,
+                                  params.conv_filter_strides_,
+                                  params.conv_filter_dilations_,
+                                  params.input_left_pads_,
+                                  params.input_right_pads_,
                                   InElementOp{},
                                   WeiElementOp{},
                                   OutElementOp{});
@@ -277,16 +277,16 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
-        params.N,
-        params.C,
-        params.K,
-        params.input_spatial_lengths,
-        params.filter_spatial_lengths,
+        params.N_,
+        params.C_,
+        params.K_,
+        params.input_spatial_lengths_,
+        params.filter_spatial_lengths_,
         output_spatial_lengths);
 
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -302,18 +302,18 @@ int main(int argc, char* argv[])
             auto ref_argument = ref_conv.MakeArgument(input,
                                                       weights,
                                                       host_output,
-                                                      params.conv_filter_strides,
-                                                      params.conv_filter_dilations,
-                                                      params.input_left_pads,
-                                                      params.input_right_pads,
+                                                      params.conv_filter_strides_,
+                                                      params.conv_filter_dilations_,
+                                                      params.input_left_pads_,
+                                                      params.input_right_pads_,
                                                       InElementOp{},
                                                       WeiElementOp{},
                                                       OutElementOp{});
 
             ref_invoker.Run(ref_argument);
             out_device_buf.FromDevice(device_output.mData.data());
-            ck::utils::check_err(
-                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+            return ck::utils::check_err(
+                device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1;
         };
 
         switch(num_dim_spatial)
@@ -338,4 +338,5 @@ int main(int argc, char* argv[])
         }
         }
     }
+    return 0;
 }
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
similarity index 80%
rename from example/09_convnd_fwd/convnd_fwd_xdl.cpp
rename to example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
index e8895b8639..8a9633d84a 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -5,7 +5,7 @@
 
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_tensor.hpp"
 #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -39,10 +39,10 @@ template <ck::index_t NumDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::
     DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
         // clang-format off
-        InDataType,         // 
+        InDataType,         //
         WeiDataType,        //
         OutDataType,        //
-        AccDataType,        // 
+        AccDataType,        //
         InElementOp,        // Input Elementwise Operation
         WeiElementOp,       // Weights Elementwise Operation
         OutElementOp,       // Output Elementwise Operation
@@ -107,7 +107,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -134,40 +134,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
     ck::utils::conv::ConvParams params;
     int arg_idx = 5;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -179,9 +179,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
 
@@ -199,21 +199,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
 
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -255,16 +255,16 @@ int main(int argc, char* argv[])
         conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                  params.N,
-                                  params.K,
-                                  params.C,
-                                  params.input_spatial_lengths,
-                                  params.filter_spatial_lengths,
+                                  params.N_,
+                                  params.K_,
+                                  params.C_,
+                                  params.input_spatial_lengths_,
+                                  params.filter_spatial_lengths_,
                                   output_spatial_lengths,
-                                  params.conv_filter_strides,
-                                  params.conv_filter_dilations,
-                                  params.input_left_pads,
-                                  params.input_right_pads,
+                                  params.conv_filter_strides_,
+                                  params.conv_filter_dilations_,
+                                  params.input_left_pads_,
+                                  params.input_right_pads_,
                                   InElementOp{},
                                   WeiElementOp{},
                                   OutElementOp{});
@@ -276,16 +276,16 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
-                                                        params.C,
-                                                        params.K,
-                                                        params.input_spatial_lengths,
-                                                        params.filter_spatial_lengths,
+        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
+                                                        params.C_,
+                                                        params.K_,
+                                                        params.input_spatial_lengths_,
+                                                        params.filter_spatial_lengths_,
                                                         output_spatial_lengths);
 
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -301,18 +301,23 @@ int main(int argc, char* argv[])
             auto ref_argument = ref_conv.MakeArgument(input,
                                                       weights,
                                                       host_output,
-                                                      params.conv_filter_strides,
-                                                      params.conv_filter_dilations,
-                                                      params.input_left_pads,
-                                                      params.input_right_pads,
+                                                      params.conv_filter_strides_,
+                                                      params.conv_filter_dilations_,
+                                                      params.input_left_pads_,
+                                                      params.input_right_pads_,
                                                       InElementOp{},
                                                       WeiElementOp{},
                                                       OutElementOp{});
 
             ref_invoker.Run(ref_argument);
             out_device_buf.FromDevice(device_output.mData.data());
-            ck::utils::check_err(
-                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+            return ck::utils::check_err(device_output.mData,
+                                        host_output.mData,
+                                        "Error: incorrect results!",
+                                        1e-5f,
+                                        1e-4f)
+                       ? 0
+                       : 1;
         };
 
         switch(num_dim_spatial)
@@ -337,4 +342,5 @@ int main(int argc, char* argv[])
         }
         }
     }
+    return 0;
 }
diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
index 34b4645770..f196d27182 100644
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -5,7 +5,7 @@
 
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_tensor.hpp"
 #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -45,10 +45,10 @@ template <ck::index_t NumDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::
     DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
         // clang-format off
-        InDataType,         // 
+        InDataType,         //
         WeiDataType,        //
         OutDataType,        //
-        AccDataType,        // 
+        AccDataType,        //
         InElementOp,        // Input Elementwise Operation
         WeiElementOp,       // Weights Elementwise Operation
         OutElementOp,       // Output Elementwise Operation
@@ -112,7 +112,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -139,40 +139,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
     ck::utils::conv::ConvParams params;
     int arg_idx = 5;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
 
@@ -204,21 +204,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
 
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -258,16 +258,16 @@ int main(int argc, char* argv[])
         conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                  params.N,
-                                  params.K,
-                                  params.C,
-                                  params.input_spatial_lengths,
-                                  params.filter_spatial_lengths,
+                                  params.N_,
+                                  params.K_,
+                                  params.C_,
+                                  params.input_spatial_lengths_,
+                                  params.filter_spatial_lengths_,
                                   output_spatial_lengths,
-                                  params.conv_filter_strides,
-                                  params.conv_filter_dilations,
-                                  params.input_left_pads,
-                                  params.input_right_pads,
+                                  params.conv_filter_strides_,
+                                  params.conv_filter_dilations_,
+                                  params.input_left_pads_,
+                                  params.input_right_pads_,
                                   InElementOp{},
                                   WeiElementOp{},
                                   OutElementOp{});
@@ -279,16 +279,16 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
-        params.N,
-        params.C,
-        params.K,
-        params.input_spatial_lengths,
-        params.filter_spatial_lengths,
+        params.N_,
+        params.C_,
+        params.K_,
+        params.input_spatial_lengths_,
+        params.filter_spatial_lengths_,
         output_spatial_lengths);
 
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -304,18 +304,18 @@ int main(int argc, char* argv[])
             auto ref_argument = ref_conv.MakeArgument(input,
                                                       weights,
                                                       host_output,
-                                                      params.conv_filter_strides,
-                                                      params.conv_filter_dilations,
-                                                      params.input_left_pads,
-                                                      params.input_right_pads,
+                                                      params.conv_filter_strides_,
+                                                      params.conv_filter_dilations_,
+                                                      params.input_left_pads_,
+                                                      params.input_right_pads_,
                                                       InElementOp{},
                                                       WeiElementOp{},
                                                       OutElementOp{});
 
             ref_invoker.Run(ref_argument);
             out_device_buf.FromDevice(device_output.mData.data());
-            ck::utils::check_err(
-                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+            return ck::utils::check_err(
+                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1;
         };
 
         switch(num_dim_spatial)
@@ -340,4 +340,5 @@ int main(int argc, char* argv[])
         }
         }
     }
+    return 0;
 }
diff --git a/example/10_conv2d_bwd_data/CMakeLists.txt b/example/10_conv2d_bwd_data/CMakeLists.txt
index f300bc9645..17aca1481b 100644
--- a/example/10_conv2d_bwd_data/CMakeLists.txt
+++ b/example/10_conv2d_bwd_data/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_util)
diff --git a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
index f3f9b497f5..2d25f5ac2f 100644
--- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // Conv shape
     ck::index_t N               = 128;
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 19)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         N               = std::stoi(argv[4]);
         K               = std::stoi(argv[5]);
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(0);
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
@@ -249,6 +249,10 @@ int main(int argc, char* argv[])
 
         in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
 
-        ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData);
+        return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
+                                    in_n_c_hi_wi_host_result.mData)
+                   ? 0
+                   : 1;
     }
+    return 0;
 }
diff --git a/example/11_conv2d_bwd_weight/CMakeLists.txt b/example/11_conv2d_bwd_weight/CMakeLists.txt
index ff001eab72..3d771b5569 100644
--- a/example/11_conv2d_bwd_weight/CMakeLists.txt
+++ b/example/11_conv2d_bwd_weight/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_util)
diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
index bf78cc87e0..1578161116 100644
--- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int do_log           = 0;
     int split_k          = 4;
 
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
     }
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
 
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4: is show log (0=no, 1=yes)\n");
         printf("arg5: split-k \n");
         printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
         return 1;
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
@@ -291,6 +291,9 @@ int main(int argc, char* argv[])
             LogRangeAsType<float>(std::cout << "wei_host  : ", wei_k_c_y_x_host_result.mData, ",")
                 << std::endl;
         }
-        ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData);
+        return ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData)
+                   ? 0
+                   : 1;
     }
+    return 0;
 }
diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt
index 734c1955d6..d6866abeb8 100644
--- a/example/12_reduce/CMakeLists.txt
+++ b/example/12_reduce/CMakeLists.txt
@@ -1 +1 @@
-add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
+add_example_executable(example_reduce_blockwise reduce_blockwise.cpp -D 16,64,32,960 -v 1 1 10)
diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index 7ca9823ff5..b2d312ae8c 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -116,10 +116,9 @@ class SimpleAppArgs
     std::vector<size_t> inLengths;
     std::vector<float> scales;
 
-    bool do_verification = false;
-
-    int init_method = 1;
-    int nrepeat     = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     public:
     void show_usage(const char* cmd)
@@ -135,7 +134,7 @@ class SimpleAppArgs
         std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
                      "value, 3=decimal value)"
                   << std::endl;
-        std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
+        std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
     };
 
     int processArgs(int argc, char* argv[])
@@ -182,7 +181,7 @@ class SimpleAppArgs
             throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
 
         init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = std::atoi(argv[optind]);
 
         if(scales.empty())
         {
@@ -352,7 +351,7 @@ int main(int argc, char* argv[])
 
     auto invoker_ptr = reduce.MakeInvokerPointer();
 
-    float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
+    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
 
     std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
                             invariant_total_length * sizeof(OutDataType);
@@ -362,16 +361,17 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
               << std::endl;
 
+    bool pass = true;
     if(args.do_verification)
     {
         out_dev.FromDevice(out.mData.data());
-        ck::utils::check_err(out.mData, out_ref.mData);
+        pass &= ck::utils::check_err(out.mData, out_ref.mData);
 
         if(NeedIndices)
         {
             out_indices_dev.FromDevice(out_indices.mData.data());
-            ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
-            ;
+            pass &= ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
         };
     };
+    return pass ? 0 : 1;
 }
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd.cpp
index a18761095c..e6749bf8d7 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd.cpp
@@ -149,9 +149,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::host_reduce;
 
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // Pool shape
     ck::index_t N               = 128;
@@ -171,13 +171,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 16)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         N               = std::stoi(argv[4]);
         C               = std::stoi(argv[5]);
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(0);
@@ -271,7 +271,7 @@ int main(int argc, char* argv[])
                                  "not support this problem");
     }
 
-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
 
@@ -285,6 +285,7 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
               << std::endl;
 
+    bool pass = true;
     if(do_verification)
     {
         pool_host_verify<InDataType,
@@ -302,14 +303,15 @@ int main(int argc, char* argv[])
 
         out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
 
-        ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
+        pass &= ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
 
         if constexpr(NeedIndices)
         {
             out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
 
-            //          ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
-            //          out_indices_n_c_ho_wo_host.mData);;
+            pass &= ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
+                                         out_indices_n_c_ho_wo_host.mData);
         };
     }
+    return pass ? 0 : 1;
 }
diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
index 324dc35d3f..9f6408a84a 100644
--- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
+++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
@@ -105,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -125,13 +125,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -145,7 +145,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
@@ -244,7 +244,7 @@ int main(int argc, char* argv[])
 
         ref_invoker.Run(ref_argument);
 
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
     }
 
     return 0;
diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
index 29ef01f2ef..8c3491c8c9 100644
--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     if(argc == 4)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         exit(0);
     }
 
@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
 
@@ -211,6 +211,7 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << gemm.GetTypeString() << std::endl;
 
+    bool pass = true;
     if(do_verification)
     {
         for(std::size_t i = 0; i < gemm_shapes.size(); i++)
@@ -227,9 +228,9 @@ int main(int argc, char* argv[])
                                                       c_element_op);
 
             ref_invoker.Run(ref_argument);
-            ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
+            pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
         }
     }
 
-    return 0;
+    return pass ? 0 : 1;
 }
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
index 90064ae584..860d9eea2a 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
@@ -4,6 +4,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -58,9 +59,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 1;
+    bool do_verification = true;
     int init_method      = 1;
-    int nrepeat          = 5;
+    bool time_kernel     = false;
 
     // GEMM shape
     ck::index_t M = 3840;
@@ -79,13 +80,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -99,7 +100,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
         exit(0);
     }
@@ -192,30 +193,13 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    // warm up
-    invoker.Run(argument);
+    // init D0, D1 to 0
+    d0_device_buf.SetZero();
+    d1_device_buf.SetZero();
 
-    // timing
-    float total_time = 0;
-
-    for(int i = 0; i < nrepeat; ++i)
-    {
-        // init DO, D1 to 0
-        d0_device_buf.SetZero();
-        d1_device_buf.SetZero();
-
-        KernelTimer timer;
-
-        timer.Start();
-
-        invoker.Run(argument);
-
-        timer.End();
-
-        total_time += timer.GetElapsedTime();
-    }
-
-    float ave_time = total_time / nrepeat;
+    // If time_kernel == true, the kernel runs multiple times. This kernel uses atomic-add, so the
+    // result will not be correct; set time_kernel = false for the correctness test.
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
@@ -228,6 +212,7 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << gemm.GetTypeString() << std::endl;
 
+    bool pass = true;
     if(do_verification)
     {
         c_device_buf.FromDevice(c_m_n_device_result.mData.data());
@@ -264,10 +249,19 @@ int main(int argc, char* argv[])
             d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
         }
 
-        check_error(c_m_n_host_result, c_m_n_device_result);
-        check_error(d0_m_host_result, d0_m_device_result);
-        check_error(d1_m_host_result, d1_m_device_result);
+        pass &= ck::utils::check_err(
+            c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c");
+        pass &= ck::utils::check_err(d0_m_device_result.mData,
+                                     d0_m_host_result.mData,
+                                     "Error: Incorrect results d0",
+                                     1e-3,
+                                     1e-3);
+        pass &= ck::utils::check_err(d1_m_device_result.mData,
+                                     d1_m_host_result.mData,
+                                     "Error: Incorrect results d1",
+                                     1e-3,
+                                     1e-3);
     }
 
-    return 0;
+    return pass ? 0 : 1;
 }
diff --git a/example/17_convnd_bwd_data_xdl/CMakeLists.txt b/example/17_convnd_bwd_data_xdl/CMakeLists.txt
index 0ed906f8f7..963f311703 100644
--- a/example/17_convnd_bwd_data_xdl/CMakeLists.txt
+++ b/example/17_convnd_bwd_data_xdl/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp)
-target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_util)
diff --git a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
index 962627ce90..ff2cfac1fa 100644
--- a/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
+++ b/example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
@@ -6,7 +6,7 @@
 #include <half.hpp>
 
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "print.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -87,7 +87,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -105,40 +105,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
     ck::utils::conv::ConvParams params;
     int arg_idx = 5;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -165,25 +165,25 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;
 
     ck::utils::conv::ConvParams params;
-    params.C = 128;
+    params.C_ = 128;
 
     if(argc == 4)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc > 4)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
         // check args number
         int conv_args     = 3 + num_dim_spatial * 6;
@@ -202,21 +202,21 @@ int main(int argc, char* argv[])
         exit(1);
     }
 
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -263,16 +263,16 @@ int main(int argc, char* argv[])
         conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                  params.N,
-                                  params.K,
-                                  params.C,
-                                  params.input_spatial_lengths,
-                                  params.filter_spatial_lengths,
+                                  params.N_,
+                                  params.K_,
+                                  params.C_,
+                                  params.input_spatial_lengths_,
+                                  params.filter_spatial_lengths_,
                                   output_spatial_lengths,
-                                  params.conv_filter_strides,
-                                  params.conv_filter_dilations,
-                                  params.input_left_pads,
-                                  params.input_right_pads,
+                                  params.conv_filter_strides_,
+                                  params.conv_filter_dilations_,
+                                  params.input_left_pads_,
+                                  params.input_right_pads_,
                                   InElementOp{},
                                   WeiElementOp{},
                                   OutElementOp{});
@@ -284,16 +284,16 @@ int main(int argc, char* argv[])
             "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = ck::utils::conv::get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
-        params.N,
-        params.C,
-        params.K,
-        params.input_spatial_lengths,
-        params.filter_spatial_lengths,
+        params.N_,
+        params.C_,
+        params.K_,
+        params.input_spatial_lengths_,
+        params.filter_spatial_lengths_,
         output_spatial_lengths);
 
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -310,10 +310,10 @@ int main(int argc, char* argv[])
             auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
                                                       wei_k_c_y_x,
                                                       out_n_k_ho_wo,
-                                                      params.conv_filter_strides,
-                                                      params.conv_filter_dilations,
-                                                      params.input_left_pads,
-                                                      params.input_right_pads,
+                                                      params.conv_filter_strides_,
+                                                      params.conv_filter_dilations_,
+                                                      params.input_left_pads_,
+                                                      params.input_right_pads_,
                                                       InElementOp{},
                                                       WeiElementOp{},
                                                       OutElementOp{});
@@ -322,7 +322,10 @@ int main(int argc, char* argv[])
 
             in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
 
-            check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
+            return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
+                                        in_n_c_hi_wi_host_result.mData)
+                       ? 0
+                       : 1;
         };
 
         switch(num_dim_spatial)
@@ -347,4 +350,5 @@ int main(int argc, char* argv[])
         }
         }
     }
+    return 0;
 }
diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
index eb18655d1b..d993c8e8d1 100644
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -4,6 +4,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -57,18 +58,18 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 1;
+    bool do_verification = true;
     int init_method      = 1;
-    int nrepeat          = 5;
+    bool time_kernel     = false;
 
     // GEMM shape
-    ck::index_t M = 3840;
-    ck::index_t N = 4096;
-    ck::index_t K = 4096;
+    ck::index_t M = 2048;
+    ck::index_t N = 1920;
+    ck::index_t K = 2048;
 
-    ck::index_t StrideA = 4096;
-    ck::index_t StrideB = 4096;
-    ck::index_t StrideC = 4096;
+    ck::index_t StrideA = 2048;
+    ck::index_t StrideB = 2048;
+    ck::index_t StrideC = 1920;
 
     ck::index_t BatchCount = 4;
 
@@ -80,13 +81,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 11)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -96,13 +97,13 @@ int main(int argc, char* argv[])
         StrideB = std::stoi(argv[8]);
         StrideC = std::stoi(argv[9]);
 
-        BatchCount = std::stoi(argv[9]);
+        BatchCount = std::stoi(argv[10]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n");
         exit(0);
     }
@@ -204,30 +205,13 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    // warm up
-    invoker.Run(argument);
+    // init D0, D1 to 0
+    d0_device_buf.SetZero();
+    d1_device_buf.SetZero();
 
-    // timing
-    float total_time = 0;
-
-    for(int i = 0; i < nrepeat; ++i)
-    {
-        // init DO, D1 to 0
-        d0_device_buf.SetZero();
-        d1_device_buf.SetZero();
-
-        KernelTimer timer;
-
-        timer.Start();
-
-        invoker.Run(argument);
-
-        timer.End();
-
-        total_time += timer.GetElapsedTime();
-    }
-
-    float ave_time = total_time / nrepeat;
+    // If time_kernel == true, the kernel runs multiple times. This kernel uses atomic-add, so the
+    // result will not be correct; set time_kernel = false for the correctness test.
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = std::size_t(2) * BatchCount * M * N * K;
     std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K +
@@ -241,6 +225,7 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << batched_gemm.GetTypeString() << std::endl;
 
+    bool pass = true;
     if(do_verification)
     {
         c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
@@ -264,7 +249,7 @@ int main(int argc, char* argv[])
 
                 for(int n = 0; n < N; ++n)
                 {
-                    float d0_val = ck::type_convert<float>(c_g_m_n_host_result(m, n));
+                    float d0_val = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n));
                     float d1_val;
 
                     d1_element_op(d1_val, d0_val);
@@ -277,10 +262,18 @@ int main(int argc, char* argv[])
             }
         }
 
-        check_error(c_g_m_n_host_result, c_g_m_n_device_result);
-        check_error(d0_g_m_host_result, d0_g_m_device_result);
-        check_error(d1_g_m_host_result, d1_g_m_device_result);
+        pass &= ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
+        pass &= ck::utils::check_err(d0_g_m_device_result.mData,
+                                     d0_g_m_host_result.mData,
+                                     "Error: Incorrect results! D0",
+                                     1e-3,
+                                     1e-3);
+        pass &= ck::utils::check_err(d1_g_m_device_result.mData,
+                                     d1_g_m_host_result.mData,
+                                     "Error: Incorrect results! D1",
+                                     1e-3,
+                                     1e-3);
     }
 
-    return 0;
+    return pass ? 0 : 1;
 }
diff --git a/example/19_cgemm/cgemm_xdl_bf16.cpp b/example/19_cgemm/cgemm_xdl_bf16.cpp
index 309fa6ac86..836a3c13dc 100644
--- a/example/19_cgemm/cgemm_xdl_bf16.cpp
+++ b/example/19_cgemm/cgemm_xdl_bf16.cpp
@@ -88,9 +88,9 @@ using ReferenceCGemmInstance = ck::tensor_operation::host::
 
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
 
     // CGEMM shape
     ck::index_t M = 3840;
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 10)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
 
         M = std::stoi(argv[4]);
         N = std::stoi(argv[5]);
@@ -223,7 +223,7 @@ int main(int argc, char* argv[])
             "not support this CGEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = std::size_t(8) * M * N * K;
     std::size_t num_btype = std::size_t(2) * sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 5ea3889844..051242ce2a 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -19,9 +19,21 @@ include_directories(BEFORE
 
 add_custom_target(examples)
 
-function(add_example_executable EXAMPLE_NAME)
+# Build example EXAMPLE_NAME from FILE_NAME and register it as a test of the same name;
+# any extra arguments (ARGN) are passed to the example on the test command line.
+function(add_example_executable EXAMPLE_NAME FILE_NAME)
     message("adding example ${EXAMPLE_NAME}")
-    add_executable(${EXAMPLE_NAME} ${ARGN})
+    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
+    target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
+    add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
+    add_dependencies(examples ${EXAMPLE_NAME})
+    add_dependencies(check ${EXAMPLE_NAME})
+endfunction()
+
+# Build example EXAMPLE_NAME from FILE_NAME without registering a test for it.
+function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
+    message("adding example ${EXAMPLE_NAME}")
+    add_executable(${EXAMPLE_NAME} ${FILE_NAME})
     target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
     add_dependencies(examples ${EXAMPLE_NAME})
-endfunction(add_example_executable EXAMPLE_NAME)
+endfunction()
diff --git a/include/ck/config.hpp b/include/ck/config.hpp
index e6deefcbe3..710cd552d7 100644
--- a/include/ck/config.hpp
+++ b/include/ck/config.hpp
@@ -109,6 +109,10 @@
 // experimental feature: use __builtin_memcpy instead of union to do bit_cast
 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
 
+// experimental feature: optimize for inter-wave scheduling policy
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
+
 // hack: have underlying assumption that need to be satsified, otherwise it's a bug
 // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
 // thread-invariant, otherwise it's a bug
diff --git a/include/ck/hip_version.hpp.in b/include/ck/hip_version.hpp.in
deleted file mode 100644
index 4290ef7e0d..0000000000
--- a/include/ck/hip_version.hpp.in
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-// "_PACKAGE_" to avoid name contentions: the macros like
-// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
-// clang-format off
-#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
-#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
-#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
-// clang-format on
-
-#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
-#define CK_HIP_PACKAGE_VERSION_MAJOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_MINOR
-#define CK_HIP_PACKAGE_VERSION_MINOR 0
-#endif
-#ifndef CK_HIP_PACKAGE_VERSION_PATCH
-#define CK_HIP_PACKAGE_VERSION_PATCH 0
-#endif
-// 3 decimal digits for major and minor, 6 digits for patch number.
-// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
-#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MAJOR > 999 || \
-    CK_HIP_PACKAGE_VERSION_PATCH > 999999
-#error "Too big HIP version number(s)"
-#endif
-#define CK_HIP_PACKAGE_VERSION_FLAT                                                      \
-    ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
-     CK_HIP_PACKAGE_VERSION_PATCH)
diff --git a/include/ck/options.hpp.in b/include/ck/options.hpp.in
new file mode 100644
index 0000000000..87ed6026a4
--- /dev/null
+++ b/include/ck/options.hpp.in
@@ -0,0 +1,3 @@
+#pragma once
+
+#cmakedefine01 CK_TIME_KERNEL
diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp
new file mode 100644
index 0000000000..3e80b4c892
--- /dev/null
+++ b/include/ck/stream_config.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+struct StreamConfig
+{
+    hipStream_t stream_id_ = nullptr;
+    bool time_kernel_      = false;
+};
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
index f1670d9c89..a989cb5297 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -7,6 +7,21 @@
 
 namespace ck {
 
+enum struct LoopScheduler
+{
+    Default,
+    Interwave,
+};
+
+constexpr LoopScheduler make_default_loop_scheduler()
+{
+#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+    return LoopScheduler::Interwave;
+#else
+    return LoopScheduler::Default;
+#endif // if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+}
+
 template <index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
@@ -302,7 +317,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
         });
     }
 
-    private:
+    protected:
     // A[M0, M1, M2, KPerThread]
     static constexpr auto a_thread_desc_ =
         make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
@@ -339,4 +354,232 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
     BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
 };
 
+// Note: To facilitate the inter-wave loop scheduler, we need to explicitly set the macro
+// CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=1 as a few intrinsics are not yet available in
+// the latest ROCm release. For unsupported compilers, the inter-wave loop scheduler falls back to
+// the default loop scheduler, which is selected by CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0.
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename AK0MK1BlockDesc,
+          typename BK0NK1BlockDesc,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack,
+          index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS>
+struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
+    : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                 FloatAB,
+                                                                 FloatAcc,
+                                                                 AK0MK1BlockDesc,
+                                                                 BK0NK1BlockDesc,
+                                                                 MPerXDL,
+                                                                 NPerXDL,
+                                                                 MRepeat,
+                                                                 NRepeat,
+                                                                 KPack>
+{
+    using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                     FloatAB,
+                                                                     FloatAcc,
+                                                                     AK0MK1BlockDesc,
+                                                                     BK0NK1BlockDesc,
+                                                                     MPerXDL,
+                                                                     NPerXDL,
+                                                                     MRepeat,
+                                                                     NRepeat,
+                                                                     KPack>;
+
+#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING // when 0, nothing is added and Base is used as-is
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::A_K1;
+    using Base::b_block_desc_n0_n1_n2_k;
+    using Base::B_K1;
+    using Base::c_thread_buf_;
+    using Base::c_thread_desc_;
+    using Base::CalculateAThreadOriginDataIndex;
+    using Base::CalculateBThreadOriginDataIndex;
+    using Base::I0;
+    using Base::I1;
+    using Base::KPerThread;
+    using Base::xdlops_gemm;
+
+    static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack);
+
+    // 2-wave optimized blockwise gemm
+    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
+    __device__ void Run(const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                // read A
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                   make_tuple(m0, I0, I0, k),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, I0),
+                                   a_thread_buf);
+            });
+            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                // read B
+                b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                   make_tuple(n0, I0, I0, k),
+                                   b_block_buf,
+                                   b_thread_desc_,
+                                   make_tuple(n0, I0, I0, I0),
+                                   b_thread_buf);
+            });
+            __builtin_amdgcn_sched_barrier();
+            // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster except the
+            // first, as we can shorten the non-MAC cluster a bit and there's no observable negative
+            // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids
+            // some out-of-sync waves hijacking MAC resource from other workgroups and reducing the
+            // chance of latency hiding by waiting for the rest of the workgroup at the eventual
+            // sync point.
+            if constexpr(k.value != 0 || KPerInnerLoop == KPerThread) // one cluster: sync anyway
+            {
+                asm volatile("s_barrier" ::);
+                __builtin_amdgcn_sched_barrier();
+            }
+            static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<FloatAB, KPack> a_thread_vec;
+                        vector_type<FloatAB, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto i) {
+                            a_thread_vec.template AsType<FloatAB>()(i) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, 0, 0, k_ + i))>{}];
+                            b_thread_vec.template AsType<FloatAB>()(i) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, 0, 0, k_ + i))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        // The block_sync_lds() here performs double duty: (A) it safeguards
+                        // against a data hazard, because the barrier from blockwise_gemm is moved
+                        // here; (B) it reduces VMEM FIFO congestion by applying small delays to
+                        // different wavefronts. It is performed near the end of the MAC cluster
+                        // to minimize the lgkmcnt penalty.
+                        if constexpr(k.value == KPerThread - KPerInnerLoop &&
+                                     k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 &&
+                                     n0.value == NRepeat - 1)
+                        {
+                            __builtin_amdgcn_sched_barrier();
+                            block_sync_lds();
+                            __builtin_amdgcn_sched_barrier();
+                        }
+
+                        // TODO: insert setprio in a more precise manner since we
+                        // could have more than one MFMA instruction in a single call
+                        xdlops_gemm.Run(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec.template AsType<mfma_input_type>(),
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                        if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
+                        {
+                            __builtin_amdgcn_sched_barrier();
+                            __builtin_amdgcn_s_setprio(1);
+                            __builtin_amdgcn_sched_barrier();
+                        }
+                    });
+                });
+            });
+            __builtin_amdgcn_sched_barrier();
+            __builtin_amdgcn_s_setprio(0);
+            __builtin_amdgcn_sched_barrier();
+        });
+    }
+
+    protected:
+    // A[M0, M1, M2, KPerInnerLoop]
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));
+
+    // B[N0, N1, N2, KPerInnerLoop]
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
+                                                         FloatAB,
+                                                         decltype(a_block_desc_m0_m1_m2_k),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, KPerInnerLoop>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         A_K1,
+                                                         A_K1>;
+
+    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
+                                                         FloatAB,
+                                                         decltype(b_block_desc_n0_n1_n2_k),
+                                                         decltype(b_thread_desc_),
+                                                         Sequence<1, 1, 1, KPerInnerLoop>,
+                                                         Sequence<0, 1, 2, 3>,
+                                                         3,
+                                                         B_K1,
+                                                         B_K1>;
+
+    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
+    BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};
+
+#endif // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
+};
+
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename AK0MK1BlockDesc,
+          typename BK0NK1BlockDesc,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack,
+          LoopScheduler LoopSched> // picks which blockwise GEMM type the selector returns
+constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
+{
+    if constexpr(LoopSched == LoopScheduler::Default) // value-initialized baseline GEMM
+    {
+        return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                   FloatAB,
+                                                                   FloatAcc,
+                                                                   AK0MK1BlockDesc,
+                                                                   BK0NK1BlockDesc,
+                                                                   MPerXDL,
+                                                                   NPerXDL,
+                                                                   MRepeat,
+                                                                   NRepeat,
+                                                                   KPack>{};
+    }
+    else if constexpr(LoopSched == LoopScheduler::Interwave) // inter-wave scheduled variant
+    {
+        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
+                                                                            FloatAB,
+                                                                            FloatAcc,
+                                                                            AK0MK1BlockDesc,
+                                                                            BK0NK1BlockDesc,
+                                                                            MPerXDL,
+                                                                            NPerXDL,
+                                                                            MRepeat,
+                                                                            NRepeat,
+                                                                            KPack>{};
+    }
+}
+
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index cf48695ad0..950cfc1d61 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -1,8 +1,9 @@
-#ifndef DEVICE_BASE_HPP
-#define DEVICE_BASE_HPP
+#pragma once
 
 #include <string>
 
+#include "stream_config.hpp"
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -22,7 +23,10 @@ struct BaseInvoker
     BaseInvoker(const BaseInvoker&) = default;
     BaseInvoker& operator=(const BaseInvoker&) = default;
 
-    virtual float Run(const BaseArgument*, int = 1) = 0;
+    virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{})
+    {
+        return float{0};
+    }
 
     virtual ~BaseInvoker() {}
 };
@@ -33,8 +37,8 @@ struct BaseOperator
     BaseOperator(const BaseOperator&) = default;
     BaseOperator& operator=(const BaseOperator&) = default;
 
-    virtual bool IsSupportedArgument(const BaseArgument*) = 0;
-    virtual std::string GetTypeString() const             = 0;
+    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
+    virtual std::string GetTypeString() const { return ""; }
 
     virtual ~BaseOperator() {}
 };
@@ -42,4 +46,3 @@ struct BaseOperator
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
index 92655b2755..a6408007ed 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -106,6 +106,9 @@ __global__ void
 #endif // end of if defined (defined(__gfx908__) || defined(__gfx90a__))
 }
 
+// Note: the inter-wave loop scheduler is rolled out to the c-shuffle version first, because the
+// non c-shuffle version currently has compiler issues with register spill which further cause
+// validation failures.
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -154,7 +157,8 @@ template <typename ALayout,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
           typename CReduceThreadClusterLengths_MPerBlock_NPerBlock,
           index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
-          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock>
+          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler()>
 struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
                                                                       BElementwiseOperation,
                                                                       CElementwiseOperation,
@@ -600,7 +604,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         CReduceThreadClusterLengths_MPerBlock_NPerBlock,
         CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
-        CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock>;
+        CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+        LoopSched>;
 
     using Block2CTileMap = decltype(MakeBlock2CTileMap(1, CGridDesc_M_N{}, 1, 1));
 
@@ -688,7 +693,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int /* nrepeat */ = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -724,6 +729,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
             const auto K =
                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
 
+            float elapsed_time = 0.0f;
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
                 const auto kernel = kernel_batched_gemm_reduce_xdl_cshuffle_v1<
@@ -743,26 +749,28 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
                     remove_reference_t<Block2CTileMap>,
                     true>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.BatchCount_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.compute_base_ptr_of_batch_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.BatchCount_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.compute_base_ptr_of_batch_,
+                                           arg.block_2_ctile_map_);
             }
             else
             {
@@ -783,35 +791,38 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
                     remove_reference_t<Block2CTileMap>,
                     false>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.BatchCount_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.compute_base_ptr_of_batch_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.BatchCount_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.compute_base_ptr_of_batch_,
+                                           arg.block_2_ctile_map_);
             }
 
-            return 0;
+            return elapsed_time;
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
index 88974a5221..ea7704951e 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -428,7 +428,7 @@ struct DeviceBatchedGemmXdl
     {
         using Argument = DeviceBatchedGemmXdl::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -477,8 +477,8 @@ struct DeviceBatchedGemmXdl
                     remove_reference_t<Block2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -511,8 +511,8 @@ struct DeviceBatchedGemmXdl
                     remove_reference_t<Block2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -534,9 +534,10 @@ struct DeviceBatchedGemmXdl
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
index 2643e46ff2..1f6ebc7042 100644
--- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -55,7 +55,8 @@ template <typename ALayout,
           index_t CShuffleMXdlPerWavePerShuffle,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler()>
 struct DeviceCGemm_4Gemm_Xdl_CShuffle
     : public DeviceCGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
@@ -376,7 +377,8 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
         CShuffleMXdlPerWavePerShuffle,
         CShuffleNXdlPerWavePerShuffle,
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CShuffleBlockTransferScalarPerVector_NPerBlock>;
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        LoopSched>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -448,7 +450,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             if(!GridwiseGemm::CheckValidity(
                    arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_))
@@ -478,146 +480,77 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;
 
-                if(nrepeat == 0)
-                {
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_real_,
-                                  arg.p_b_grid_real_,
-                                  arg.p_c_grid_real_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 1 of 4: a_real, b_real and c_real grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_real_,
+                                           arg.p_b_grid_real_,
+                                           arg.p_c_grid_real_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_imag_,
-                                  arg.p_b_grid_imag_,
-                                  arg.p_aux_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 2 of 4: a_imag, b_imag and aux grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_imag_,
+                                           arg.p_b_grid_imag_,
+                                           arg.p_aux_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    // c_real = c_real - aux needed here!!!
+                // TODO: c_real = c_real - aux still has to be applied here.
 
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_real_,
-                                  arg.p_b_grid_imag_,
-                                  arg.p_c_grid_imag_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 3 of 4: a_real, b_imag and c_imag grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_real_,
+                                           arg.p_b_grid_imag_,
+                                           arg.p_c_grid_imag_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_imag_,
-                                  arg.p_b_grid_real_,
-                                  arg.p_aux_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 4 of 4: a_imag, b_real and aux grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_imag_,
+                                           arg.p_b_grid_real_,
+                                           arg.p_aux_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    // c_imag = c_imag + aux needed here!!!
-                }
-                else
-                {
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_real_,
-                                               arg.p_b_grid_real_,
-                                               arg.p_c_grid_real_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_imag_,
-                                               arg.p_b_grid_imag_,
-                                               arg.p_aux_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    // // c_real = c_real - aux needed here!!!
-
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_real_,
-                                               arg.p_b_grid_imag_,
-                                               arg.p_c_grid_imag_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_imag_,
-                                               arg.p_b_grid_real_,
-                                               arg.p_aux_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    // c_imag = c_imag + aux needed here!!!
-                }
+                // TODO: c_imag = c_imag + aux still has to be applied here.
             }
             else
             {
@@ -634,155 +567,87 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;
 
-                if(nrepeat == 0)
-                {
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_real_,
-                                  arg.p_b_grid_real_,
-                                  arg.p_c_grid_real_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 1 of 4: a_real, b_real and c_real grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_real_,
+                                           arg.p_b_grid_real_,
+                                           arg.p_c_grid_real_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_imag_,
-                                  arg.p_b_grid_imag_,
-                                  arg.p_aux_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 2 of 4: a_imag, b_imag and aux grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_imag_,
+                                           arg.p_b_grid_imag_,
+                                           arg.p_aux_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    // // c_real = c_real - aux needed here!!!
+                // TODO: c_real = c_real - aux still has to be applied here.
 
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_real_,
-                                  arg.p_b_grid_imag_,
-                                  arg.p_c_grid_imag_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 3 of 4: a_real, b_imag and c_imag grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_real_,
+                                           arg.p_b_grid_imag_,
+                                           arg.p_c_grid_imag_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_imag_,
-                                  arg.p_b_grid_real_,
-                                  arg.p_aux_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
+                ave_time += // launch 4 of 4: a_imag, b_real and aux grid pointers
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_imag_,
+                                           arg.p_b_grid_real_,
+                                           arg.p_aux_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
 
-                    // c_imag = c_imag + aux needed here!!!
-                }
-                else
-                {
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_real_,
-                                               arg.p_b_grid_real_,
-                                               arg.p_c_grid_real_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_imag_,
-                                               arg.p_b_grid_imag_,
-                                               arg.p_aux_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    // c_real = c_real - aux needed here!!!
-
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_real_,
-                                               arg.p_b_grid_imag_,
-                                               arg.p_c_grid_imag_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    ave_time +=
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_imag_,
-                                               arg.p_b_grid_real_,
-                                               arg.p_aux_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-
-                    // c_imag = c_imag + aux needed here!!!
-                }
+                // TODO: c_imag = c_imag + aux still has to be applied here.
             }
 
             return ave_time;
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config); // unchecked cast
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index 466e6ad89f..c36227083c 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -415,9 +415,13 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        // Runs the kernel selected below for `arg`. The C grid buffer is cleared first, then
+        // the kernel is launched through launch_and_time_kernel with stream_config; the value
+        // that helper returns is handed back to the caller as ave_time.
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             ShowInfo(arg);
+
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                             arg.b_grid_desc_kbatch_k0_n_k1_,
                                             arg.c_grid_desc_m_n_,
@@ -437,49 +441,32 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                if(nrepeat > 0)
-                {
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_grid_desc_kbatch_k0_m_k1_,
-                                               arg.b_grid_desc_kbatch_k0_n_k1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.block_2_ctile_map_);
-                }
+                // Clear C first; presumably required when kbatch > 1 accumulates into C.
+                // NOTE(review): the hipMemset status is stringified and then discarded.
+                hipGetErrorString(hipMemset(
+                    arg.p_c_grid_,
+                    0,
+                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+                        sizeof(CDataType)));
 
-                if(kbatch > 1 || nrepeat <= 0)
-                {
-                    hipGetErrorString(hipMemset(
-                        arg.p_c_grid_,
-                        0,
-                        arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                            sizeof(CDataType)));
-
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_grid_desc_kbatch_k0_m_k1_,
-                                  arg.b_grid_desc_kbatch_k0_n_k1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.block_2_ctile_map_);
-                }
+                // Keep the returned time; the enclosing Run() returns ave_time.
+                // NOTE(review): if timing repeats the launch, C is cleared only once - confirm.
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.a_grid_desc_kbatch_k0_m_k1_,
+                                           arg.b_grid_desc_kbatch_k0_n_k1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.block_2_ctile_map_);
             };
 
             if(has_main_k0_block_loop)
@@ -560,9 +547,12 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        // Type-erased entry point: casts p_arg to Argument and forwards to Run above.
+        // NOTE(review): the dynamic_cast result is dereferenced without a null check.
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
index fad4ec1ffa..def6af74ac 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
@@ -531,7 +531,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
             for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -602,8 +602,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         true>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config, // the Run() parameter, passed through
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -635,8 +635,8 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                         false>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config, // the Run() parameter, passed through
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -655,9 +655,10 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config); // unchecked cast
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
index 6648929cd5..fd95c184ca 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -642,7 +642,7 @@ struct
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -727,8 +727,8 @@ struct
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -771,8 +771,8 @@ struct
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -795,9 +795,10 @@ struct
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
index fd0941420c..61c91c0b76 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -605,7 +605,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -684,8 +684,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -723,8 +723,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -745,9 +745,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
index b508606a75..f4cddc1946 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -568,7 +568,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -663,8 +663,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -697,8 +697,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -717,9 +717,10 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 3574f7667e..aa9229f7cb 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -450,7 +450,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -498,8 +498,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -529,8 +529,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -549,9 +549,10 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
index c3ebe58865..b1eea0b33f 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <memory>
 #include <sstream>
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_conv_fwd.hpp"
 #include "common_header.hpp"
@@ -92,7 +92,7 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto naive_conv3d_fwd =
                 ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
@@ -103,8 +103,8 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
                                                       WeiElementwiseOperation,
                                                       OutElementwiseOperation>;
 
-            float ave_time = launch_and_time_kernel(naive_conv3d_fwd,
-                                                    nrepeat,
+            float ave_time = launch_and_time_kernel(stream_config,
+                                                    naive_conv3d_fwd,
                                                     dim3(256),
                                                     dim3(256),
                                                     0,
@@ -137,9 +137,10 @@ struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_W
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
index ff30a6880d..0f98ba054d 100644
--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -438,7 +438,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl;
@@ -487,8 +487,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
                     OutElementwiseOperation,
                     remove_reference_t<Block2CTileMap>,
                     true>;
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -522,8 +522,8 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
                     remove_reference_t<Block2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -547,9 +547,10 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
index 5dca8f9629..209b3c866e 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -1241,7 +1241,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             float ave_time = 0;
             for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
@@ -1316,8 +1316,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
                         true>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config,
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -1349,8 +1349,8 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
                         false>;
 
                     ave_time += launch_and_time_kernel(
+                        stream_config,
                         kernel,
-                        nrepeat,
                         dim3(grid_size),
                         dim3(BlockSize),
                         0,
@@ -1369,9 +1369,10 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
index 7365f9a3e2..4251052a99 100644
--- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp
@@ -747,7 +747,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -795,8 +795,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -826,8 +826,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -846,9 +846,10 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             return ave_time;
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
index 1a3fbdf956..69c29b72d3 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
@@ -14,6 +14,9 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
+// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Because non c-shuffle
+// version currently has compiler issues with register spill which further causes validation
+// failures.
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -62,7 +65,8 @@ template <typename ALayout,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
           typename CReduceThreadClusterLengths_MPerBlock_NPerBlock,
           index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
-          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock>
+          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler()>
 struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
                                                                BElementwiseOperation,
                                                                CElementwiseOperation,
@@ -422,7 +426,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         CReduceThreadClusterLengths_MPerBlock_NPerBlock,
         CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
-        CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock>;
+        CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+        LoopSched>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -498,7 +503,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int /* nrepeat */ = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -531,6 +536,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
             const auto K =
                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
 
+            float elapsed_time = 0.0f;
             if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
             {
                 const auto kernel = kernel_gemm_reduce_xdl_cshuffle_v1<
@@ -549,24 +555,26 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.block_2_ctile_map_);
             }
             else
             {
@@ -586,33 +594,36 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;
 
-                launch_kernel(kernel,
-                              dim3(grid_size),
-                              dim3(BlockSize),
-                              0,
-                              arg.p_a_grid_,
-                              arg.p_b_grid_,
-                              arg.p_c_grid_,
-                              arg.p_d0_grid_,
-                              arg.p_d1_grid_,
-                              arg.a_element_op_,
-                              arg.b_element_op_,
-                              arg.c_element_op_,
-                              arg.d1_element_op_,
-                              arg.a_grid_desc_ak0_m_ak1_,
-                              arg.b_grid_desc_bk0_n_bk1_,
-                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                              arg.d_grid_desc_mblock_mperblock_,
-                              arg.block_2_ctile_map_);
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_d0_grid_,
+                                           arg.p_d1_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.d1_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.block_2_ctile_map_);
             }
 
-            return 0;
+            return elapsed_time;
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
index 47997cd802..2bb7f6e78a 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp
@@ -290,7 +290,7 @@ struct DeviceGemmXdl
     {
         using Argument = DeviceGemmXdl::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -339,8 +339,8 @@ struct DeviceGemmXdl
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     true>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -370,8 +370,8 @@ struct DeviceGemmXdl
                     remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
                     false>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -391,9 +391,10 @@ struct DeviceGemmXdl
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
index 4010965312..315f39d9bf 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp
@@ -264,7 +264,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
     {
         using Argument = DeviceGemmXdl_C_Shuffle_Bias_2d::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -320,8 +320,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -359,8 +359,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -382,9 +382,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_2d
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
index c65ff6022a..f1f9f41724 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation.hpp
@@ -273,7 +273,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -329,8 +329,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -368,8 +368,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -391,9 +391,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
index 4a478c995d..e3d0986aba 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_activation_add.hpp
@@ -312,7 +312,7 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             {
                 std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
@@ -374,8 +374,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
                     true>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -418,8 +418,8 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
                     false>;
 
                 ave_time = launch_and_time_kernel(
+                    stream_config,
                     kernel,
-                    nrepeat,
                     dim3(grid_size),
                     dim3(BlockSize),
                     0,
@@ -443,9 +443,10 @@ struct DeviceGemmXdl_C_Shuffle_Bias_Activation_Add
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
index 440519537e..952630120a 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp
@@ -14,6 +14,9 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
+// Note: the inter-wave loop scheduler is rolled out to the c-shuffle version first, because the
+// non c-shuffle version currently has compiler issues with register spilling, which in turn cause
+// validation failures.
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -54,7 +57,8 @@ template <typename ALayout,
           index_t CShuffleMXdlPerWavePerShuffle,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler()>
 struct DeviceGemm_Xdl_CShuffle
     : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
@@ -375,7 +379,8 @@ struct DeviceGemm_Xdl_CShuffle
         CShuffleMXdlPerWavePerShuffle,
         CShuffleNXdlPerWavePerShuffle,
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        CShuffleBlockTransferScalarPerVector_NPerBlock>;
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        LoopSched>;
 
     // Argument
     struct Argument : public BaseArgument
@@ -435,7 +440,7 @@ struct DeviceGemm_Xdl_CShuffle
     {
         using Argument = DeviceOp::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
 #if 0
             {
@@ -482,42 +487,22 @@ struct DeviceGemm_Xdl_CShuffle
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     true>;
 
-                if(nrepeat == 0)
-                {
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
-                }
-                else
-                {
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-                }
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
             }
             else
             {
@@ -533,52 +518,32 @@ struct DeviceGemm_Xdl_CShuffle
                     typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
                     typename GridwiseGemm::DefaultBlock2CTileMap,
                     false>;
-
-                if(nrepeat == 0)
-                {
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.a_grid_desc_ak0_m_ak1_,
-                                  arg.b_grid_desc_bk0_n_bk1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.block_2_ctile_map_);
-                }
-                else
-                {
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.a_grid_desc_ak0_m_ak1_,
-                                               arg.b_grid_desc_bk0_n_bk1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.block_2_ctile_map_);
-                }
+                ave_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_);
             }
 
             return ave_time;
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
index db6c884739..e603af1fba 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp
@@ -385,8 +385,11 @@ struct DeviceGemmXdlSplitK
             std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
-        float Run(const Argument& arg, int nrepeat = 1)
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            ShowInfo(arg);
+
             const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
 
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
@@ -408,50 +411,30 @@ struct DeviceGemmXdlSplitK
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                if(nrepeat > 0)
-                {
-                    ShowInfo(arg);
-                    ave_time = launch_and_time_kernel(kernel,
-                                                      nrepeat,
-                                                      dim3(grid_size),
-                                                      dim3(BlockSize),
-                                                      0,
-                                                      arg.p_a_grid_,
-                                                      arg.p_b_grid_,
-                                                      arg.p_c_grid_,
-                                                      arg.a_grid_desc_kbatch_k0_m_k1_,
-                                                      arg.b_grid_desc_kbatch_k0_n_k1_,
-                                                      arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                                      arg.a_element_op_,
-                                                      arg.b_element_op_,
-                                                      arg.c_element_op_,
-                                                      arg.block_2_ctile_map_);
-                }
+                // FIXME: this should be moved outside of DeviceOp
+                hipGetErrorString(
+                    hipMemset(arg.p_c_grid_,
+                              0,
+                              arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
+                                  sizeof(CDataType)));
 
-                if(kbatch > 1 || nrepeat <= 0)
-                {
-                    hipGetErrorString(
-                        hipMemset(arg.p_c_grid_,
-                                  0,
-                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_.GetElementSpaceSize() *
-                                      sizeof(CDataType)));
-
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_grid_desc_kbatch_k0_m_k1_,
-                                  arg.b_grid_desc_kbatch_k0_n_k1_,
-                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.block_2_ctile_map_);
-                }
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  arg.p_a_grid_,
+                                                  arg.p_b_grid_,
+                                                  arg.p_c_grid_,
+                                                  arg.a_grid_desc_kbatch_k0_m_k1_,
+                                                  arg.b_grid_desc_kbatch_k0_n_k1_,
+                                                  arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
+                                                  arg.a_element_op_,
+                                                  arg.b_element_op_,
+                                                  arg.c_element_op_,
+                                                  arg.block_2_ctile_map_);
             };
+
             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
@@ -531,9 +514,10 @@ struct DeviceGemmXdlSplitK
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
index 9de5361ab6..7d00224429 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -391,8 +391,11 @@ struct DeviceGemmXdlSplitKCShuffle
             std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
                       << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
         }
-        float Run(const Argument& arg, int nrepeat = 1)
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
+            ShowInfo(arg);
+
             const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0);
 
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
@@ -414,51 +417,29 @@ struct DeviceGemmXdlSplitKCShuffle
             float ave_time = 0;
 
             const auto Run = [&](const auto& kernel) {
-                if(nrepeat > 0)
-                {
-                    ShowInfo(arg);
-                    ave_time =
-                        launch_and_time_kernel(kernel,
-                                               nrepeat,
-                                               dim3(grid_size),
-                                               dim3(BlockSize),
-                                               0,
-                                               arg.p_a_grid_,
-                                               arg.p_b_grid_,
-                                               arg.p_c_grid_,
-                                               arg.a_grid_desc_kbatch_k0_m_k1_,
-                                               arg.b_grid_desc_kbatch_k0_n_k1_,
-                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                               arg.a_element_op_,
-                                               arg.b_element_op_,
-                                               arg.c_element_op_,
-                                               arg.block_2_ctile_map_);
-                }
+                hipGetErrorString(hipMemset(
+                    arg.p_c_grid_,
+                    0,
+                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+                        sizeof(CDataType)));
 
-                if(kbatch > 1 || nrepeat <= 0)
-                {
-                    hipGetErrorString(hipMemset(
-                        arg.p_c_grid_,
-                        0,
-                        arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
-                            sizeof(CDataType)));
-
-                    launch_kernel(kernel,
-                                  dim3(grid_size),
-                                  dim3(BlockSize),
-                                  0,
-                                  arg.p_a_grid_,
-                                  arg.p_b_grid_,
-                                  arg.p_c_grid_,
-                                  arg.a_grid_desc_kbatch_k0_m_k1_,
-                                  arg.b_grid_desc_kbatch_k0_n_k1_,
-                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                  arg.a_element_op_,
-                                  arg.b_element_op_,
-                                  arg.c_element_op_,
-                                  arg.block_2_ctile_map_);
-                }
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
+                                                  dim3(grid_size),
+                                                  dim3(BlockSize),
+                                                  0,
+                                                  arg.p_a_grid_,
+                                                  arg.p_b_grid_,
+                                                  arg.p_c_grid_,
+                                                  arg.a_grid_desc_kbatch_k0_m_k1_,
+                                                  arg.b_grid_desc_kbatch_k0_n_k1_,
+                                                  arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                                  arg.a_element_op_,
+                                                  arg.b_element_op_,
+                                                  arg.c_element_op_,
+                                                  arg.block_2_ctile_map_);
             };
+
             if(has_main_k0_block_loop)
             {
                 if(kbatch == 1)
@@ -542,9 +523,10 @@ struct DeviceGemmXdlSplitKCShuffle
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
index dfc1ce2715..730b2d787e 100644
--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
@@ -449,7 +449,7 @@ struct DeviceGroupedGemmXdl
     {
         using Argument = DeviceGroupedGemmXdl::Argument;
 
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             StaticallyIndexedArray<GemmDescKernelArg, MaxGroupCount> gemm_desc_kernel_args;
 
@@ -510,8 +510,8 @@ struct DeviceGroupedGemmXdl
                                                     true,
                                                     MaxGroupCount>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(arg.grid_size_),
                                                   dim3(BlockSize),
                                                   0,
@@ -534,8 +534,8 @@ struct DeviceGroupedGemmXdl
                                                     false,
                                                     MaxGroupCount>;
 
-                ave_time = launch_and_time_kernel(kernel,
-                                                  nrepeat,
+                ave_time = launch_and_time_kernel(stream_config,
+                                                  kernel,
                                                   dim3(arg.grid_size_),
                                                   dim3(BlockSize),
                                                   0,
@@ -550,9 +550,10 @@ struct DeviceGroupedGemmXdl
         }
 
         // polymorphic
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
index 651d31ae2f..f665378e08 100644
--- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -204,7 +204,7 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
                                                                          OutDataType,
@@ -241,8 +241,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
 
             const index_t grid_size = (ReduceM / ReduceM_BlockTileSize);
 
-            return launch_and_time_kernel(kernel,
-                                          nrepeat,
+            return launch_and_time_kernel(stream_config,
+                                          kernel,
                                           dim3(grid_size),
                                           dim3(BlockSize),
                                           0,
@@ -257,9 +257,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
                                           arg.p_out_indices_dev_);
         }
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
index 4f17989b53..860f53d8c5 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise.hpp
@@ -211,7 +211,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k =
                 DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
@@ -253,8 +253,8 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
                                                         InElementwiseOperation,
                                                         AccElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -272,9 +272,10 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         };
     };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
index d3b1b4b5c3..43ac48cecc 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_blockwise_second_call.hpp
@@ -182,7 +182,7 @@ struct DeviceReduceBlockWiseSecondCall
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_);
@@ -224,8 +224,8 @@ struct DeviceReduceBlockWiseSecondCall
                                                                     InElementwiseOperation,
                                                                     AccElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -243,10 +243,11 @@ struct DeviceReduceBlockWiseSecondCall
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
index 889c366875..f93c65fe18 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_atomic_add.hpp
@@ -245,7 +245,7 @@ struct DeviceReduceMultiBlockAtomicAdd
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
@@ -275,8 +275,6 @@ struct DeviceReduceMultiBlockAtomicAdd
 
             float avg_time = 0;
 
-            KernelTimer timer;
-
             const auto kernel_pre  = kernel_buffer_set_value<BlockSize, OutDataType, OutGridDesc_M>;
             const auto kernel_main = kernel_reduce_multiblock_atocmi_add<GridwiseReduce,
                                                                          InDataType,
@@ -287,50 +285,38 @@ struct DeviceReduceMultiBlockAtomicAdd
                                                                          InElementwiseOperation,
                                                                          AccElementwiseOperation>;
 
-            printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n",
-                   arg.gridSize,
-                   BlockSize);
-            printf("Warm up\n");
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel_pre,
+                                               dim3(arg.gridSize_pre),
+                                               dim3(BlockSize),
+                                               0,
+                                               out_grid_desc_m,
+                                               arg.out_dev_,
+                                               static_cast<OutDataType>(0.0f));
 
-            for(int i = 0; i < nrepeat + 1; i++)
-            {
-                if(i == 1)
-                    timer.Start();
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel_main,
+                                               dim3(arg.gridSize),
+                                               dim3(BlockSize),
+                                               0,
+                                               in_grid_desc_m_k,
+                                               out_grid_desc_m,
+                                               arg.in_elementwise_op_,
+                                               arg.acc_elementwise_op_,
+                                               arg.blkGroupSize,
+                                               arg.kBlockTileIterations,
+                                               arg.alpha_,
+                                               arg.in_dev_,
+                                               arg.out_dev_);
 
-                launch_kernel(kernel_pre,
-                              dim3(arg.gridSize_pre),
-                              dim3(BlockSize),
-                              0,
-                              out_grid_desc_m,
-                              arg.out_dev_,
-                              static_cast<OutDataType>(0.0f));
+            return avg_time;
+        }
 
-                launch_kernel(kernel_main,
-                              dim3(arg.gridSize),
-                              dim3(BlockSize),
-                              0,
-                              in_grid_desc_m_k,
-                              out_grid_desc_m,
-                              arg.in_elementwise_op_,
-                              arg.acc_elementwise_op_,
-                              arg.blkGroupSize,
-                              arg.kBlockTileIterations,
-                              arg.alpha_,
-                              arg.in_dev_,
-                              arg.out_dev_);
-            };
-
-            timer.End();
-
-            avg_time = timer.GetElapsedTime() / nrepeat;
-
-            return (avg_time);
-        };
-
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
index d583f7f1b8..b4eb8116c2 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock_partial_reduce.hpp
@@ -273,7 +273,7 @@ struct DeviceReduceMultiBlockPartialReduce
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
                 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
@@ -313,8 +313,8 @@ struct DeviceReduceMultiBlockPartialReduce
                                                                  InElementwiseOperation,
                                                                  AccElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -331,10 +331,11 @@ struct DeviceReduceMultiBlockPartialReduce
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
index bf4088a96b..dacb175043 100644
--- a/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
@@ -212,7 +212,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
 
     struct Invoker : public BaseInvoker
     {
-        float Run(const Argument& arg, int nrepeat = 1)
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
             const auto in_grid_desc_m_k =
                 DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
@@ -254,8 +254,8 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
                                                          InElementwiseOperation,
                                                          OutElementwiseOperation>;
 
-            avg_time = launch_and_time_kernel(kernel,
-                                              nrepeat,
+            avg_time = launch_and_time_kernel(stream_config,
+                                              kernel,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
@@ -272,10 +272,11 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
             return (avg_time);
         };
 
-        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
         {
-            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
-        };
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
     };
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
index 6a1b6eef31..20c3a0b618 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include "common_header.hpp"
+#include "tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
 
 namespace ck {
 
@@ -248,4 +249,116 @@ struct GridwiseGemmPipeline_v1<2>
     }
 };
 
+template <index_t NumPrefetch>
+struct GridwiseGemmPipelineInterwave_v1; // declared only; defined per NumPrefetch below
+// Specialization for NumPrefetch == 1 (returned by the selector for LoopScheduler::Interwave).
+template <>
+struct GridwiseGemmPipelineInterwave_v1<1>
+{
+    __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; }
+    // IsSupported accepts every num_loop; CalculateHasMainLoop is true once num_loop exceeds 1.
+    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
+    {
+        return num_loop > 1;
+    }
+    // Run: prologue load, optional main loop (HasMainLoop), then one final GEMM in the tail.
+    template <bool HasMainLoop,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename BlockwiseGemm,
+              typename CThreadBuffer>
+    static __device__ void Run(const AGridDesc& a_grid_desc,
+                               const ABlockDesc& a_block_desc,
+                               ABlockTransfer& a_blockwise_copy,
+                               const AGridBuffer& a_grid_buf,
+                               ABlockBuffer& a_block_buf,
+                               const ABlockTransferStep& a_block_copy_step,
+                               const BGridDesc& b_grid_desc,
+                               const BBlockDesc& b_block_desc,
+                               BBlockTransfer& b_blockwise_copy,
+                               const BGridBuffer& b_grid_buf,
+                               BBlockBuffer& b_block_buf,
+                               const BBlockTransferStep& b_block_copy_step,
+                               const BlockwiseGemm& blockwise_gemm,
+                               CThreadBuffer& c_thread_buf,
+                               index_t num_loop)
+    {
+        // Prologue: read the first A/B data from the grid buffers and advance the source windows.
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Clear the C accumulator, then write the data just read into the A/B block buffers.
+        c_thread_buf.Clear();
+
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+        // Main body: compiled in only if HasMainLoop; runs num_loop - 1 times when num_loop > 1.
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+
+            do
+            {
+                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+
+                block_sync_lds();
+
+                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+                blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+
+                // No block_sync_lds() here; that sync was moved into blockwise_gemm.
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+                ++i;
+            } while(i < (num_loop - 1));
+        }
+
+        // Tail: sync, then run the GEMM on the last data written to the block buffers.
+        {
+            block_sync_lds();
+
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+        }
+    }
+};
+
+// Note: 2-stage prefetch is not optimized for the inter-wave loop scheduler.
+template <>
+struct GridwiseGemmPipelineInterwave_v1<2> : public GridwiseGemmPipeline_v1<2>
+{ // empty on purpose: behavior is inherited unchanged from GridwiseGemmPipeline_v1<2>
+};
+
+template <index_t NumPrefetch, LoopScheduler LoopSched>
+constexpr auto GridwiseGemmPipeline_v1_Selector() // return type depends on LoopSched
+{
+    if constexpr(LoopSched == LoopScheduler::Default)
+    {
+        return GridwiseGemmPipeline_v1<NumPrefetch>{};
+    }
+    else if constexpr(LoopSched == LoopScheduler::Interwave)
+    {
+        return GridwiseGemmPipelineInterwave_v1<NumPrefetch>{};
+    }
+} // NOTE(review): any other LoopSched value would fall through and deduce void -- confirm none exist
+
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
index 4e2e279ef3..cf98ea8043 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -134,7 +134,8 @@ template <typename FloatAB,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
           typename CReduceThreadClusterLengths_MPerBlock_NPerBlock,
           index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
-          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock>
+          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+          LoopScheduler LoopSched>
 struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 {
     static constexpr auto I0 = Number<0>{};
@@ -473,17 +474,18 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         constexpr index_t KPack = math::max(
             math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
 
-        auto blockwise_gemm =
-            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatAB,
-                                                                FloatGemmAcc,
-                                                                decltype(a_block_desc_ak0_m_ak1),
-                                                                decltype(b_block_desc_bk0_n_bk1),
-                                                                MPerXdl,
-                                                                NPerXdl,
-                                                                MXdlPerWave,
-                                                                NXdlPerWave,
-                                                                KPack>{};
+        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
+            BlockSize,
+            FloatAB,
+            FloatGemmAcc,
+            decltype(a_block_desc_ak0_m_ak1),
+            decltype(b_block_desc_bk0_n_bk1),
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            KPack,
+            LoopSched>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
@@ -502,25 +504,28 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
 
         // gridwise GEMM pipeline
+        const auto gridwise_gemm_pipeline =
+            GridwiseGemmPipeline_v1_Selector<NumGemmKPrefetchStage, LoopSched>();
+
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
             (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
             KPerBlock);
 
-        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
-                                                          a_block_desc_ak0_m_ak1,
-                                                          a_blockwise_copy,
-                                                          a_grid_buf,
-                                                          a_block_buf,
-                                                          a_block_slice_copy_step,
-                                                          b_grid_desc_bk0_n_bk1,
-                                                          b_block_desc_bk0_n_bk1,
-                                                          b_blockwise_copy,
-                                                          b_grid_buf,
-                                                          b_block_buf,
-                                                          b_block_slice_copy_step,
-                                                          blockwise_gemm,
-                                                          c_thread_buf,
-                                                          num_k_block_main_loop);
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
+                                                               a_block_desc_ak0_m_ak1,
+                                                               a_blockwise_copy,
+                                                               a_grid_buf,
+                                                               a_block_buf,
+                                                               a_block_slice_copy_step,
+                                                               b_grid_desc_bk0_n_bk1,
+                                                               b_block_desc_bk0_n_bk1,
+                                                               b_blockwise_copy,
+                                                               b_grid_buf,
+                                                               b_block_buf,
+                                                               b_block_slice_copy_step,
+                                                               blockwise_gemm,
+                                                               c_thread_buf,
+                                                               num_k_block_main_loop);
 
         // shuffle C and write out
         {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
index b28907b43e..f0eabf9de6 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -107,7 +107,8 @@ template <typename FloatAB,
           index_t CShuffleMXdlPerWavePerShuffle,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched>
 struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
 {
     static constexpr auto I0 = Number<0>{};
@@ -416,17 +417,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         constexpr index_t KPack = math::max(
             math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
 
-        auto blockwise_gemm =
-            BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
-                                                                FloatAB,
-                                                                FloatGemmAcc,
-                                                                decltype(a_block_desc_ak0_m_ak1),
-                                                                decltype(b_block_desc_bk0_n_bk1),
-                                                                MPerXdl,
-                                                                NPerXdl,
-                                                                MXdlPerWave,
-                                                                NXdlPerWave,
-                                                                KPack>{};
+        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
+            BlockSize,
+            FloatAB,
+            FloatGemmAcc,
+            decltype(a_block_desc_ak0_m_ak1),
+            decltype(b_block_desc_bk0_n_bk1),
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            KPack,
+            LoopSched>();
 
         auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
 
@@ -445,25 +447,28 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
 
         // gridwise GEMM pipeline
+        const auto gridwise_gemm_pipeline =
+            GridwiseGemmPipeline_v1_Selector<NumGemmKPrefetchStage, LoopSched>();
+
         const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
             (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
             KPerBlock);
 
-        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
-                                                          a_block_desc_ak0_m_ak1,
-                                                          a_blockwise_copy,
-                                                          a_grid_buf,
-                                                          a_block_buf,
-                                                          a_block_slice_copy_step,
-                                                          b_grid_desc_bk0_n_bk1,
-                                                          b_block_desc_bk0_n_bk1,
-                                                          b_blockwise_copy,
-                                                          b_grid_buf,
-                                                          b_block_buf,
-                                                          b_block_slice_copy_step,
-                                                          blockwise_gemm,
-                                                          c_thread_buf,
-                                                          num_k_block_main_loop);
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
+                                                               a_block_desc_ak0_m_ak1,
+                                                               a_blockwise_copy,
+                                                               a_grid_buf,
+                                                               a_block_buf,
+                                                               a_block_slice_copy_step,
+                                                               b_grid_desc_bk0_n_bk1,
+                                                               b_block_desc_bk0_n_bk1,
+                                                               b_blockwise_copy,
+                                                               b_grid_buf,
+                                                               b_block_buf,
+                                                               b_block_slice_copy_step,
+                                                               blockwise_gemm,
+                                                               c_thread_buf,
+                                                               num_k_block_main_loop);
 
         // shuffle C and write out
         {
diff --git a/library/include/ck/library/host/host_interface.hpp b/library/include/ck/library/host/host_interface.hpp
new file mode 100644
index 0000000000..955da0f4be
--- /dev/null
+++ b/library/include/ck/library/host/host_interface.hpp
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "stream_config.hpp"
+#include "config.hpp"
+#include "device_base.hpp"
+
+// Pimpl-style wrapper: all state lives in DeviceConvFwdPtrImpl, which is only forward-declared
+// here, so including this header does not expose the implementation type.
+struct DeviceConvFwdPtr_t
+{
+    using BaseArgument = ck::tensor_operation::device::BaseArgument;
+    using BaseInvoker  = ck::tensor_operation::device::BaseInvoker;
+
+    struct DeviceConvFwdPtrImpl;
+    std::unique_ptr<DeviceConvFwdPtrImpl> pImpl;
+    DeviceConvFwdPtr_t();
+    ~DeviceConvFwdPtr_t();
+    DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&);
+    DeviceConvFwdPtr_t(DeviceConvFwdPtrImpl&);
+    // Copy assignment is deleted for both non-const and const sources.
+    DeviceConvFwdPtr_t& operator=(DeviceConvFwdPtr_t&) = delete;
+    DeviceConvFwdPtr_t& operator=(const DeviceConvFwdPtr_t&) = delete;
+    // Returns a BaseArgument built from the raw in/wei/out pointers, the N/K/C sizes and the
+    // per-dimension length, stride, dilation and padding vectors.
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(void* in_ptr,
+                        void* wei_ptr,
+                        void* out_ptr,
+                        size_t N,
+                        size_t K,
+                        size_t C,
+                        std::vector<ck::index_t> input_spatial_lengths,
+                        std::vector<ck::index_t> filter_spatial_lengths,
+                        std::vector<ck::index_t> output_spatial_lengths,
+                        std::vector<ck::index_t> conv_filter_strides,
+                        std::vector<ck::index_t> conv_filter_dilations,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads)
+        const; // in,wei and out element ops are ignored for now since even if we change them, they
+               // can't be linked
+    std::unique_ptr<BaseInvoker>
+    MakeInvokerPointer() const; // requires including BaseInvoker headers
+    std::string GetTypeString();
+    bool IsSupportedArgument(const BaseArgument* arg_ptr);
+};
+
+// Instance registration entry points, one per data type / kernel variant.
+// NOTE(review): presumably each call appends its instances to `instances` -- confirm in the .cpp.
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances);
diff --git a/library/include/ck/library/host_tensor/device.hpp b/library/include/ck/library/host_tensor/device.hpp
index f33b8d4f40..d549b14c8c 100644
--- a/library/include/ck/library/host_tensor/device.hpp
+++ b/library/include/ck/library/host_tensor/device.hpp
@@ -1,12 +1,25 @@
-#ifndef DEVICE_HPP
-#define DEVICE_HPP
+#pragma once
 
 #include <memory>
 #include <functional>
 #include <thread>
 #include <chrono>
-#include "hip/hip_runtime.h"
-#include "hip/hip_fp16.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+#include "stream_config.hpp"
+#include "ck/options.hpp"
+
+inline void hip_check_error(hipError_t x) // throws std::runtime_error unless x == hipSuccess
+{
+    if(x != hipSuccess)
+    {
+        std::ostringstream ss; // NOTE: the location macros below expand here, not at the call site
+        ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
+           << " in function: " << __func__;
+        throw std::runtime_error(ss.str()); // NOTE(review): <sstream>/<stdexcept> not included directly
+    }
+}
 
 struct DeviceMem
 {
@@ -36,49 +49,59 @@ struct KernelTimer
     std::unique_ptr<KernelTimerImpl> impl;
 };
 
-using device_stream_t = hipStream_t;
-
 template <typename... Args, typename F>
-void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
+float launch_and_time_kernel(const StreamConfig& stream_config,
+                             F kernel,
+                             dim3 grid_dim,
+                             dim3 block_dim,
+                             std::size_t lds_byte,
+                             Args... args)
 {
-    hipStream_t stream_id = nullptr;
-
-    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-}
-
-template <typename... Args, typename F>
-float launch_and_time_kernel(
-    F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
-{
-    KernelTimer timer;
-
-    printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-           __func__,
-           grid_dim.x,
-           grid_dim.y,
-           grid_dim.z,
-           block_dim.x,
-           block_dim.y,
-           block_dim.z);
-
-    printf("Warm up\n");
-
-    hipStream_t stream_id = nullptr;
-
-    // warm up
-    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
-
-    printf("Start running %d times...\n", nrepeat);
-
-    timer.Start();
-
-    for(int i = 0; i < nrepeat; ++i)
+#if CK_TIME_KERNEL // build-wide timing switch (CMake option CK_TIME_KERNEL, default ON)
+    if(stream_config.time_kernel_) // per-call opt-in to timing
     {
-        hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
+        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
+               __func__,
+               grid_dim.x,
+               grid_dim.y,
+               grid_dim.z,
+               block_dim.x,
+               block_dim.y,
+               block_dim.z);
+
+        const int nrepeat = 10; // number of timed launches (fixed; no longer a parameter)
+
+        printf("Warm up 1 time\n");
+
+        // warm up
+        hipLaunchKernelGGL(
+            kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+
+        printf("Start running %d times...\n", nrepeat);
+
+        KernelTimer timer;
+        timer.Start();
+
+        for(int i = 0; i < nrepeat; ++i)
+        {
+            hipLaunchKernelGGL(
+                kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+        }
+
+        timer.End();
+
+        return timer.GetElapsedTime() / nrepeat; // mean over timed launches; warm-up excluded
     }
+    else
+    {
+        hipLaunchKernelGGL(
+            kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
 
-    timer.End();
+        return 0; // single untimed launch
+    }
+#else // timing compiled out: always a single launch
+    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
 
-    return timer.GetElapsedTime() / nrepeat;
-}
+    return 0;
 #endif
+}
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
index 3a706dac0b..f4944a28d2 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
@@ -84,7 +84,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
index c55b86aea7..79c0468c82 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp
@@ -135,7 +135,8 @@ struct ReferenceCGemm : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
index c5f3cbad69..10619ae6d9 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp
@@ -121,7 +121,8 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
index 9e91f06e7f..45fc8b8503 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp
@@ -291,7 +291,8 @@ struct ReferenceConvBwdData : public device::BaseOperator
             }
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
index 65e59db2f8..d1afa898e4 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp
@@ -1,9 +1,10 @@
-#ifndef REFERENCE_CONV_FWD_HPP
-#define REFERENCE_CONV_FWD_HPP
+#pragma once
 
 #include <iostream>
 #include <type_traits>
 #include <sstream>
+
+#include "stream_config.hpp"
 #include "device_base.hpp"
 #include "host_tensor.hpp"
 
@@ -251,7 +252,8 @@ struct ReferenceConvFwd : public device::BaseOperator
             }
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
@@ -311,4 +313,3 @@ struct ReferenceConvFwd : public device::BaseOperator
 } // namespace host
 } // namespace tensor_operation
 } // namespace ck
-#endif
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
index ee95cd410a..4be6169c15 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp
@@ -124,7 +124,8 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
index 11232cc98f..466537c686 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp
@@ -130,7 +130,8 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index 1b49ca5740..d89c8f5e05 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -80,7 +80,8 @@ struct ReferenceGemm : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
index 7dd6fc9199..3e7f220e03 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp
@@ -82,7 +82,8 @@ struct ReferenceGemmBias2D : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
index 7c9df272c2..60f72e9e51 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp
@@ -85,7 +85,8 @@ struct ReferenceGemmBiasActivation : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
index 4d3c5effae..5e0ec75e5e 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp
@@ -91,7 +91,8 @@ struct ReferenceGemmBiasActivationAdd : public device::BaseOperator
             return 0;
         }
 
-        float Run(const device::BaseArgument* p_arg, int) override
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
         {
             return Run(*dynamic_cast<const Argument*>(p_arg));
         }
diff --git a/library/include/ck/library/utility/conv_fwd_util.hpp b/library/include/ck/library/utility/conv_util.hpp
similarity index 95%
rename from library/include/ck/library/utility/conv_fwd_util.hpp
rename to library/include/ck/library/utility/conv_util.hpp
index a29eb814fd..c881b89705 100644
--- a/library/include/ck/library/utility/conv_fwd_util.hpp
+++ b/library/include/ck/library/utility/conv_util.hpp
@@ -146,19 +146,19 @@ struct ConvParams
                const std::vector<ck::index_t>& left_pads,
                const std::vector<ck::index_t>& right_pads);
 
-    ck::index_t num_dim_spatial;
-    ck::index_t N;
-    ck::index_t K;
-    ck::index_t C;
+    ck::index_t num_dim_spatial_;
+    ck::index_t N_;
+    ck::index_t K_;
+    ck::index_t C_;
 
-    std::vector<ck::index_t> filter_spatial_lengths;
-    std::vector<ck::index_t> input_spatial_lengths;
+    std::vector<ck::index_t> filter_spatial_lengths_;
+    std::vector<ck::index_t> input_spatial_lengths_;
 
-    std::vector<ck::index_t> conv_filter_strides;
-    std::vector<ck::index_t> conv_filter_dilations;
+    std::vector<ck::index_t> conv_filter_strides_;
+    std::vector<ck::index_t> conv_filter_dilations_;
 
-    std::vector<ck::index_t> input_left_pads;
-    std::vector<ck::index_t> input_right_pads;
+    std::vector<ck::index_t> input_left_pads_;
+    std::vector<ck::index_t> input_right_pads_;
 
     std::vector<ck::index_t> GetOutputSpatialLengths() const;
 };
@@ -268,10 +268,10 @@ void run_reference_convolution_forward(const ConvParams& params,
     auto ref_argument = ref_conv.MakeArgument(input,
                                               weights,
                                               output,
-                                              params.conv_filter_strides,
-                                              params.conv_filter_dilations,
-                                              params.input_left_pads,
-                                              params.input_right_pads,
+                                              params.conv_filter_strides_,
+                                              params.conv_filter_dilations_,
+                                              params.input_left_pads_,
+                                              params.input_right_pads_,
                                               PassThrough{},
                                               PassThrough{},
                                               PassThrough{});
@@ -437,17 +437,17 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
 
     virtual InTensorsTuple GetInputTensors() const override
     {
-        std::vector<std::size_t> input_dims{static_cast<std::size_t>(params_.N),
-                                            static_cast<std::size_t>(params_.C)};
+        std::vector<std::size_t> input_dims{static_cast<std::size_t>(params_.N_),
+                                            static_cast<std::size_t>(params_.C_)};
         input_dims.insert(std::end(input_dims),
-                          std::begin(params_.input_spatial_lengths),
-                          std::end(params_.input_spatial_lengths));
+                          std::begin(params_.input_spatial_lengths_),
+                          std::end(params_.input_spatial_lengths_));
 
-        std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params_.K),
-                                             static_cast<std::size_t>(params_.C)};
+        std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params_.K_),
+                                             static_cast<std::size_t>(params_.C_)};
         filter_dims.insert(std::end(filter_dims),
-                           std::begin(params_.filter_spatial_lengths),
-                           std::end(params_.filter_spatial_lengths));
+                           std::begin(params_.filter_spatial_lengths_),
+                           std::end(params_.filter_spatial_lengths_));
 
         auto input = std::make_unique<Tensor<InDataType>>(
             get_host_tensor_descriptor(input_dims, InLayout{}));
@@ -465,8 +465,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
 
     virtual TensorPtr<OutDataType> GetOutputTensor() const override
     {
-        std::vector<std::size_t> output_dims{static_cast<std::size_t>(params_.N),
-                                             static_cast<std::size_t>(params_.K)};
+        std::vector<std::size_t> output_dims{static_cast<std::size_t>(params_.N_),
+                                             static_cast<std::size_t>(params_.K_)};
         output_dims.insert(std::end(output_dims),
                            std::begin(output_spatial_lengths_),
                            std::end(output_spatial_lengths_));
@@ -522,16 +522,16 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
             static_cast<InDataType*>(in_device_buffers[0]->GetDeviceBuffer()),
             static_cast<WeiDataType*>(in_device_buffers[1]->GetDeviceBuffer()),
             static_cast<OutDataType*>(out_device_buffer->GetDeviceBuffer()),
-            params_.N,
-            params_.K,
-            params_.C,
-            params_.input_spatial_lengths,
-            params_.filter_spatial_lengths,
+            params_.N_,
+            params_.K_,
+            params_.C_,
+            params_.input_spatial_lengths_,
+            params_.filter_spatial_lengths_,
             output_spatial_lengths_,
-            params_.conv_filter_strides,
-            params_.conv_filter_dilations,
-            params_.input_left_pads,
-            params_.input_right_pads,
+            params_.conv_filter_strides_,
+            params_.conv_filter_dilations_,
+            params_.input_left_pads_,
+            params_.input_right_pads_,
             InElementwiseOp{},
             WeiElementwiseOp{},
             OutElementwiseOp{});
@@ -539,20 +539,20 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
 
     virtual std::size_t GetFlops() const override
     {
-        return get_flops(params_.N,
-                         params_.C,
-                         params_.K,
-                         params_.filter_spatial_lengths,
+        return get_flops(params_.N_,
+                         params_.C_,
+                         params_.K_,
+                         params_.filter_spatial_lengths_,
                          output_spatial_lengths_);
     }
 
     virtual std::size_t GetBtype() const override
     {
-        return get_btype<InDataType, WeiDataType, OutDataType>(params_.N,
-                                                               params_.C,
-                                                               params_.K,
-                                                               params_.input_spatial_lengths,
-                                                               params_.filter_spatial_lengths,
+        return get_btype<InDataType, WeiDataType, OutDataType>(params_.N_,
+                                                               params_.C_,
+                                                               params_.K_,
+                                                               params_.input_spatial_lengths_,
+                                                               params_.filter_spatial_lengths_,
                                                                output_spatial_lengths_);
     }
 
diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp
index ec88b4e1b9..5429f66d3e 100644
--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
 
     template <typename OpInstancePtr>
     ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
-                              int nrepeat          = 100,
+                              bool time_kernel     = false,
                               bool do_verification = false,
                               bool do_log          = false)
     {
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
             if(op_ptr->IsSupportedArgument(argument.get()))
             {
                 std::string op_name = op_ptr->GetTypeString();
-                float avg_time      = invoker->Run(argument.get(), nrepeat);
+                float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
                 std::size_t flops     = op_instance_.GetFlops();
                 std::size_t num_btype = op_instance_.GetBtype();
diff --git a/library/src/host_tensor/CMakeLists.txt b/library/src/host_tensor/CMakeLists.txt
index fd100e477f..2a020b763d 100644
--- a/library/src/host_tensor/CMakeLists.txt
+++ b/library/src/host_tensor/CMakeLists.txt
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
     host_tensor.cpp
 )
 
-add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
+add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE})
+add_library(composable_kernel::host_tensor ALIAS host_tensor)
+
 target_compile_features(host_tensor PUBLIC)
 set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-install(TARGETS host_tensor LIBRARY DESTINATION lib)
+
+# Header search paths for consumers of the installed package.
+target_include_directories(host_tensor PUBLIC
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>")
+
+# Install the library and add it to the host_tensorTargets export set.
+install(TARGETS host_tensor
+    EXPORT host_tensorTargets
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+# Install that export set as an importable targets file in the composable_kernel:: namespace.
+install(EXPORT host_tensorTargets
+    FILE composable_kernelhost_tensorTargets.cmake
+    NAMESPACE composable_kernel::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
 
 clang_tidy_check(host_tensor)
diff --git a/library/src/host_tensor/device.cpp b/library/src/host_tensor/device.cpp
index 3e80df80fb..9f0d982dbc 100644
--- a/library/src/host_tensor/device.cpp
+++ b/library/src/host_tensor/device.cpp
@@ -2,7 +2,7 @@
 
 DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
 {
-    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+    hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
 }
 
 void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
 
 void DeviceMem::ToDevice(const void* p)
 {
-    hipGetErrorString(
-        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
+    hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
 }
 
 void DeviceMem::FromDevice(void* p)
 {
-    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
+    hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
 }
 
-void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); }
+void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
 
-DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
+DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); } // NOTE(review): hip_check_error is defined elsewhere; if it throws, throwing from this destructor calls std::terminate -- confirm
 
 struct KernelTimerImpl
 {
     KernelTimerImpl()
     {
-        hipGetErrorString(hipEventCreate(&mStart));
-        hipGetErrorString(hipEventCreate(&mEnd));
+        hip_check_error(hipEventCreate(&mStart));
+        hip_check_error(hipEventCreate(&mEnd));
     }
 
     ~KernelTimerImpl()
     {
-        hipGetErrorString(hipEventDestroy(mStart));
-        hipGetErrorString(hipEventDestroy(mEnd));
+        hip_check_error(hipEventDestroy(mStart));
+        hip_check_error(hipEventDestroy(mEnd));
     }
 
     void Start()
     {
-        hipGetErrorString(hipDeviceSynchronize());
-        hipGetErrorString(hipEventRecord(mStart, nullptr));
+        hip_check_error(hipDeviceSynchronize());
+        hip_check_error(hipEventRecord(mStart, nullptr));
     }
 
     void End()
     {
-        hipGetErrorString(hipEventRecord(mEnd, nullptr));
-        hipGetErrorString(hipEventSynchronize(mEnd));
+        hip_check_error(hipEventRecord(mEnd, nullptr));
+        hip_check_error(hipEventSynchronize(mEnd));
     }
 
     float GetElapsedTime() const
     {
         float time;
-        hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
+        hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
         return time;
     }
 
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 9d4c19db90..66dfa7c605 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -11,6 +11,7 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
     ${PROJECT_SOURCE_DIR}/external/include/half
@@ -18,7 +19,7 @@ include_directories(BEFORE
 
 function(add_instance_library INSTANCE_NAME)
     message("adding instance ${INSTANCE_NAME}")
-    add_library(${INSTANCE_NAME} SHARED ${ARGN}) 
+    add_library(${INSTANCE_NAME} OBJECT ${ARGN}) 
     target_compile_features(${INSTANCE_NAME} PUBLIC)
     set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endfunction(add_instance_library INSTANCE_NAME)
@@ -42,3 +43,74 @@ add_subdirectory(grouped_gemm)
 add_subdirectory(conv2d_bwd_weight)
 add_subdirectory(batched_gemm_reduce)
 add_subdirectory(cgemm)
+
+# Bundle the per-operation OBJECT libraries into one installable static archive.
+add_library(device_operations STATIC
+    $<TARGET_OBJECTS:device_conv1d_fwd_instance>
+    $<TARGET_OBJECTS:device_batched_gemm_instance>
+    $<TARGET_OBJECTS:device_conv2d_bwd_data_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_add_instance>
+    $<TARGET_OBJECTS:device_conv2d_fwd_bias_relu_atomic_add_instance>
+    $<TARGET_OBJECTS:device_gemm_instance>
+    $<TARGET_OBJECTS:device_gemm_bias_relu_instance>
+    $<TARGET_OBJECTS:device_gemm_bias_relu_add_instance>
+    $<TARGET_OBJECTS:device_gemm_bias2d_instance>
+    $<TARGET_OBJECTS:device_reduce_instance>
+    $<TARGET_OBJECTS:device_convnd_bwd_data_instance>
+    $<TARGET_OBJECTS:device_grouped_gemm_instance>
+    $<TARGET_OBJECTS:device_conv2d_bwd_weight_instance>
+    $<TARGET_OBJECTS:device_batched_gemm_reduce_instance>
+    $<TARGET_OBJECTS:device_conv3d_fwd_instance>
+    $<TARGET_OBJECTS:device_cgemm_instance>
+    device_conv2d.cpp
+)
+# Alias under the same namespace the export set below uses; the old spelling is kept too.
+add_library(composable_kernel::device_operations ALIAS device_operations)
+add_library(composablekernels::device_operations ALIAS device_operations)
+
+set(DEV_OPS_INC_DIRS
+    ${PROJECT_SOURCE_DIR}/include/ck/
+    ${PROJECT_SOURCE_DIR}/library/include/ck/
+    ${PROJECT_SOURCE_DIR}/external/include/
+)
+target_compile_features(device_operations PUBLIC)
+set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(device_operations PUBLIC
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/half>
+)
+
+# Once more architectures are enabled, make this a top-level CMake option that is passed down and exported.
+target_compile_options(device_operations
+    PRIVATE --offload-arch=gfx908
+)
+
+install(TARGETS device_operations
+        EXPORT device_operationsTargets
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+        INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
+install(EXPORT device_operationsTargets
+        FILE composable_kerneldevice_operationsTargets.cmake
+        NAMESPACE composable_kernel::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
index 35e24462b5..016c85f673 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
@@ -18,9 +18,9 @@ set(DEVICE_BATCHED_GEMM_INSTANCE_SOURCE
    device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp;
 )
 
-add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
-target_compile_features(device_batched_gemm_instance PUBLIC)
+add_library(device_batched_gemm_instance OBJECT ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
+# OBJECT library: its object files are collected into the device_operations static library.
 set_target_properties(device_batched_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_batched_gemm_instance LIBRARY DESTINATION lib)
+# Not installed on its own; the aggregated device_operations target is what gets installed.
 
 clang_tidy_check(device_batched_gemm_instance)
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
index 59eb6cb1cc..67a3c15d00 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt
@@ -5,7 +5,8 @@ set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE
     device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp
 )
 
-add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
-install(TARGETS device_batched_gemm_reduce_instance LIBRARY DESTINATION lib)
+# add_instance_library() (defined in ../CMakeLists.txt) already creates the OBJECT
+# library and enables POSITION_INDEPENDENT_CODE, so neither is repeated here.
+add_instance_library(device_batched_gemm_reduce_instance ${DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE})
 clang_tidy_check(device_batched_gemm_reduce_instance)
 
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
index 6c7c3e4f78..77aa6198f5 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv1d_fwd/CMakeLists.txt
@@ -6,9 +6,9 @@ set(DEVICE_CONV1D_FWD_INSTANCE_SOURCE
    device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instance.cpp;
 )
 
-add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv1d_fwd_instance PUBLIC)
+add_library(device_conv1d_fwd_instance OBJECT ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) 
+# OBJECT library: its object files are collected into the device_operations static library.
 set_target_properties(device_conv1d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv1d_fwd_instance LIBRARY DESTINATION lib) 
+# Not installed on its own; the aggregated device_operations target is what gets installed.
 
 clang_tidy_check(device_conv1d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
index d619ef4bf1..d7882a7d8b 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE
    device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
 ) 
 
-add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
-target_compile_features(device_conv2d_bwd_data_instance PUBLIC)
+add_library(device_conv2d_bwd_data_instance OBJECT ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
 set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_bwd_data_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
index 6183e70b9b..7c384a882b 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_weight/CMakeLists.txt
@@ -3,7 +3,7 @@ set(DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE
    device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
    device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
 )
-add_library(device_conv2d_bwd_weight_instance SHARED ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) 
+add_library(device_conv2d_bwd_weight_instance OBJECT ${DEVICE_CONV2D_BWD_WEIGHT_INSTANCE_SOURCE}) 
 target_compile_features(device_conv2d_bwd_weight_instance PUBLIC)
 set_target_properties(device_conv2d_bwd_weight_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_conv2d_bwd_weight_instance LIBRARY DESTINATION lib) 
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
index 7483861524..857e36d6f5 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
    device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
-add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_instance PUBLIC)
+add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
index 27a9736a3f..ad66c73bf8 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
@@ -2,9 +2,7 @@
 set(DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
-add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_bias_relu_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
index d7bec82174..36b1f6c153 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt
@@ -2,9 +2,7 @@
 set(DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
-add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_bias_relu_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
index c0942d5485..5906c7c5ac 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_atomic_add/CMakeLists.txt
@@ -3,9 +3,7 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE
    device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
 
-add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) 
-target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
+add_library(device_conv2d_fwd_bias_relu_atomic_add_instance OBJECT ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) 
 set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv2d_fwd_bias_relu_atomic_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
index f6849a7bb2..91a299c742 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/conv3d_fwd/CMakeLists.txt
@@ -5,9 +5,8 @@ set(DEVICE_CONV3D_FWD_INSTANCE_SOURCE
    device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp;
    device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
 )
-add_library(device_conv3d_fwd_instance SHARED ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) 
+add_library(device_conv3d_fwd_instance OBJECT ${DEVICE_CONV3D_FWD_INSTANCE_SOURCE}) 
 target_compile_features(device_conv3d_fwd_instance PUBLIC)
 set_target_properties(device_conv3d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_conv3d_fwd_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_conv3d_fwd_instance)
diff --git a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
index 9ee961ad74..037f860808 100644
--- a/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/convnd_bwd_data/CMakeLists.txt
@@ -14,7 +14,7 @@ set(DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE
    device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp;
 ) 
 
-add_library(device_convnd_bwd_data_instance SHARED ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
+add_library(device_convnd_bwd_data_instance OBJECT ${DEVICE_CONVND_BWD_DATA_INSTANCE_SOURCE})
 target_compile_features(device_convnd_bwd_data_instance PUBLIC)
 set_target_properties(device_convnd_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_convnd_bwd_data_instance LIBRARY DESTINATION lib) 
diff --git a/library/src/tensor_operation_instance/gpu/device_conv2d.cpp b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp
new file mode 100644
index 0000000000..6b99433ffa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/device_conv2d.cpp
@@ -0,0 +1,201 @@
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+#include "host_interface.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_fwd_instance {
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
+    std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
+
+} // namespace device_conv2d_fwd_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+struct DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl
+{
+    std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
+    MakeArgumentPointer(void* in_ptr,
+                        void* wei_ptr,
+                        void* out_ptr,
+                        size_t N,
+                        size_t K,
+                        size_t C,
+                        std::vector<ck::index_t> input_spatial_lengths,
+                        std::vector<ck::index_t> filter_spatial_lengths,
+                        std::vector<ck::index_t> output_spatial_lengths,
+                        std::vector<ck::index_t> conv_filter_strides,
+                        std::vector<ck::index_t> conv_filter_dilations,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads) const
+    {
+        return el->MakeArgumentPointer(in_ptr,
+                                       wei_ptr,
+                                       out_ptr,
+                                       N,
+                                       K,
+                                       C,
+                                       input_spatial_lengths,
+                                       filter_spatial_lengths,
+                                       output_spatial_lengths,
+                                       conv_filter_strides,
+                                       conv_filter_dilations,
+                                       input_left_pads,
+                                       input_right_pads,
+                                       PassThrough{},
+                                       PassThrough{},
+                                       PassThrough{});
+    }
+    std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> MakeInvokerPointer() const
+    {
+        return el->MakeInvokerPointer();
+    }
+
+    std::string GetTypeString() { return el->GetTypeString(); }
+    bool IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg)
+    {
+        return el->IsSupportedArgument(arg);
+    }
+
+    ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough> el;
+};
+
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t() : pImpl(nullptr) {}
+DeviceConvFwdPtr_t::~DeviceConvFwdPtr_t()                    = default;
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t&&) = default;
+DeviceConvFwdPtr_t::DeviceConvFwdPtr_t(DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl& other)
+    : pImpl(std::make_unique<DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl>(std::move(other)))
+{
+}
+
+std::unique_ptr<DeviceConvFwdPtr_t::BaseArgument>
+DeviceConvFwdPtr_t::MakeArgumentPointer(void* in_ptr,
+                                        void* wei_ptr,
+                                        void* out_ptr,
+                                        size_t N,
+                                        size_t K,
+                                        size_t C,
+                                        std::vector<ck::index_t> input_spatial_lengths,
+                                        std::vector<ck::index_t> filter_spatial_lengths,
+                                        std::vector<ck::index_t> output_spatial_lengths,
+                                        std::vector<ck::index_t> conv_filter_strides,
+                                        std::vector<ck::index_t> conv_filter_dilations,
+                                        std::vector<ck::index_t> input_left_pads,
+                                        std::vector<ck::index_t> input_right_pads) const
+{
+    return pImpl->MakeArgumentPointer(in_ptr,
+                                      wei_ptr,
+                                      out_ptr,
+                                      N,
+                                      K,
+                                      C,
+                                      input_spatial_lengths,
+                                      filter_spatial_lengths,
+                                      output_spatial_lengths,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads);
+}
+
+std::unique_ptr<DeviceConvFwdPtr_t::BaseInvoker> DeviceConvFwdPtr_t::MakeInvokerPointer() const
+{
+    return pImpl->MakeInvokerPointer();
+}
+
+std::string DeviceConvFwdPtr_t::GetTypeString() { return pImpl->GetTypeString(); }
+bool DeviceConvFwdPtr_t::IsSupportedArgument(const DeviceConvFwdPtr_t::BaseArgument* arg_ptr)
+{
+    return pImpl->IsSupportedArgument(arg_ptr);
+}
+
+using namespace ck::tensor_operation::device::device_conv2d_fwd_instance;
+void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp);
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp); // Perhaps we can do better
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp); // Perhaps we can do better
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp); // Perhaps we can do better
+    }
+    return;
+}
+
+void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(
+    std::vector<DeviceConvFwdPtr_t>& instances)
+{
+    std::vector<
+        ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>
+        local_instances;
+    add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(local_instances);
+    for(auto& kinder : local_instances)
+    {
+        DeviceConvFwdPtr_t::DeviceConvFwdPtrImpl tmp{std::move(kinder)};
+        instances.emplace_back(tmp);
+    }
+    return;
+}
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index 5f057adcc5..556b06d7e1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -35,10 +35,9 @@ set(DEVICE_GEMM_INSTANCE_SOURCE
    device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
+add_library(device_gemm_instance OBJECT ${DEVICE_GEMM_INSTANCE_SOURCE})
 
 target_compile_features(device_gemm_instance PUBLIC)
 set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
index a0e5ba61a1..e2b0abb1d1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias2d/CMakeLists.txt
@@ -10,9 +10,7 @@ set(DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE
    device_gemm_xdl_c_shuffle_bias_2d_f16_f16_f16_mk_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_bias2d_instance SHARED ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
-target_compile_features(device_gemm_bias2d_instance PUBLIC)
+add_library(device_gemm_bias2d_instance OBJECT ${DEVICE_GEMM_BIAS2D_INSTANCE_SOURCE})
 set_target_properties(device_gemm_bias2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias2d_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_bias2d_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
index 69e05673d6..e2e7d4badd 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE
    device_gemm_xdl_c_shuffle_bias_relu_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) 
-target_compile_features(device_gemm_bias_relu_instance PUBLIC)
+add_library(device_gemm_bias_relu_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) 
 set_target_properties(device_gemm_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias_relu_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_bias_relu_instance)
diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
index 016bc4be2d..a10dbb555d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bias_relu_add/CMakeLists.txt
@@ -6,9 +6,7 @@ set(DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE
    device_gemm_xdl_c_shuffle_bias_relu_add_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
-target_compile_features(device_gemm_bias_relu_add_instance PUBLIC)
+add_library(device_gemm_bias_relu_add_instance OBJECT ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
 set_target_properties(device_gemm_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_gemm_bias_relu_add_instance LIBRARY DESTINATION lib)
 
 clang_tidy_check(device_gemm_bias_relu_add_instance)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
index 8f591d8c49..6c5e31fddd 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
@@ -6,7 +6,7 @@ set(DEVICE_GROUPED_GEMM_INSTANCE_SOURCE
    device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
 )
 
-add_library(device_grouped_gemm_instance SHARED ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) 
+add_library(device_grouped_gemm_instance OBJECT ${DEVICE_GROUPED_GEMM_INSTANCE_SOURCE}) 
 
 target_compile_features(device_grouped_gemm_instance PUBLIC)
 set_target_properties(device_grouped_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
index cced3a4b76..81987ac0d4 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt
@@ -38,9 +38,7 @@ set(DEVICE_REDUCE_INSTANCE_SOURCE
    device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.cpp;
 )
 
-add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE}) 
-target_compile_features(device_reduce_instance PUBLIC)
+add_library(device_reduce_instance OBJECT ${DEVICE_REDUCE_INSTANCE_SOURCE}) 
 set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS device_reduce_instance LIBRARY DESTINATION lib) 
 
 clang_tidy_check(device_reduce_instance)
diff --git a/library/src/utility/CMakeLists.txt b/library/src/utility/CMakeLists.txt
index 3580ba1a8f..0914855d59 100644
--- a/library/src/utility/CMakeLists.txt
+++ b/library/src/utility/CMakeLists.txt
@@ -8,14 +8,14 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
 )
 
-set(CONV_FWD_UTIL_SOURCE
-    conv_fwd_util.cpp
+set(CONV_UTIL_SOURCE
+    conv_util.cpp
 )
 
-add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE})
-target_link_libraries(conv_fwd_util PRIVATE host_tensor)
-target_compile_features(conv_fwd_util PUBLIC)
-set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(conv_fwd_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
+add_library(conv_util SHARED ${CONV_UTIL_SOURCE})
+target_link_libraries(conv_util PRIVATE host_tensor)
+target_compile_features(conv_util PUBLIC)
+set_target_properties(conv_util PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_include_directories(conv_util SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 
-clang_tidy_check(conv_fwd_util)
+clang_tidy_check(conv_util)
diff --git a/library/src/utility/conv_fwd_util.cpp b/library/src/utility/conv_util.cpp
similarity index 62%
rename from library/src/utility/conv_fwd_util.cpp
rename to library/src/utility/conv_util.cpp
index 01bfeda16d..a60d1a3495 100644
--- a/library/src/utility/conv_fwd_util.cpp
+++ b/library/src/utility/conv_util.cpp
@@ -1,5 +1,5 @@
 
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 
 namespace ck {
 namespace utils {
@@ -37,16 +37,16 @@ std::size_t get_flops(ck::index_t N,
 }
 
 ConvParams::ConvParams()
-    : num_dim_spatial(2),
-      N(128),
-      K(256),
-      C(192),
-      filter_spatial_lengths(2, 3),
-      input_spatial_lengths(2, 71),
-      conv_filter_strides(2, 2),
-      conv_filter_dilations(2, 1),
-      input_left_pads(2, 1),
-      input_right_pads(2, 1)
+    : num_dim_spatial_(2),
+      N_(128),
+      K_(256),
+      C_(192),
+      filter_spatial_lengths_(2, 3),
+      input_spatial_lengths_(2, 71),
+      conv_filter_strides_(2, 2),
+      conv_filter_dilations_(2, 1),
+      input_left_pads_(2, 1),
+      input_right_pads_(2, 1)
 {
 }
 
@@ -60,23 +60,23 @@ ConvParams::ConvParams(ck::index_t n_dim,
                        const std::vector<ck::index_t>& dilations,
                        const std::vector<ck::index_t>& left_pads,
                        const std::vector<ck::index_t>& right_pads)
-    : num_dim_spatial(n_dim),
-      N(n_batch),
-      K(n_out_channels),
-      C(n_in_channels),
-      filter_spatial_lengths(filters_len),
-      input_spatial_lengths(input_len),
-      conv_filter_strides(strides),
-      conv_filter_dilations(dilations),
-      input_left_pads(left_pads),
-      input_right_pads(right_pads)
+    : num_dim_spatial_(n_dim),
+      N_(n_batch),
+      K_(n_out_channels),
+      C_(n_in_channels),
+      filter_spatial_lengths_(filters_len),
+      input_spatial_lengths_(input_len),
+      conv_filter_strides_(strides),
+      conv_filter_dilations_(dilations),
+      input_left_pads_(left_pads),
+      input_right_pads_(right_pads)
 {
-    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
     {
         throw(
             std::runtime_error("ConvParams::GetOutputSpatialLengths: "
@@ -86,27 +86,28 @@ ConvParams::ConvParams(ck::index_t n_dim,
 
 std::vector<ck::index_t> ConvParams::GetOutputSpatialLengths() const
 {
-    if(ck::type_convert<ck::index_t>(filter_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_spatial_lengths.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_strides.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(conv_filter_dilations.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_left_pads.size()) != num_dim_spatial ||
-       ck::type_convert<ck::index_t>(input_right_pads.size()) != num_dim_spatial)
+    if(ck::type_convert<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
+       ck::type_convert<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
     {
         throw(
             std::runtime_error("ConvParams::GetOutputSpatialLengths: "
                                "parameter size is different from number of declared dimensions!"));
     }
 
-    std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
-    for(ck::index_t i = 0; i < num_dim_spatial; ++i)
+    std::vector<ck::index_t> out_spatial_len(num_dim_spatial_, 0);
+    for(ck::index_t i = 0; i < num_dim_spatial_; ++i)
     {
         // XEff = (X - 1) * conv_dilation_w + 1;
         // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-        const ck::index_t idx_eff = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
+        const ck::index_t idx_eff =
+            (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
         out_spatial_len[i] =
-            (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
-                conv_filter_strides[i] +
+            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - idx_eff) /
+                conv_filter_strides_[i] +
             1;
     }
     return out_spatial_len;
@@ -116,40 +117,40 @@ ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[
 {
     ck::utils::conv::ConvParams params;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -228,12 +229,12 @@ HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::siz
 std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p)
 {
     os << "ConvParams {"
-       << "\nnum_dim_spatial: " << p.num_dim_spatial << "\nN: " << p.N << "\nK: " << p.K
-       << "\nC: " << p.C << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths
-       << "\ninput_spatial_lengths: " << p.input_spatial_lengths
-       << "\nconv_filter_strides: " << p.conv_filter_strides
-       << "\nconv_filter_dilations: " << p.conv_filter_dilations
-       << "\ninput_left_pads: " << p.input_left_pads
-       << "\ninput_right_pads: " << p.input_right_pads;
+       << "\nnum_dim_spatial: " << p.num_dim_spatial_ << "\nN: " << p.N_ << "\nK: " << p.K_
+       << "\nC: " << p.C_ << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths_
+       << "\ninput_spatial_lengths: " << p.input_spatial_lengths_
+       << "\nconv_filter_strides: " << p.conv_filter_strides_
+       << "\nconv_filter_dilations: " << p.conv_filter_dilations_
+       << "\ninput_left_pads: " << p.input_left_pads_
+       << "\ninput_right_pads: " << p.input_right_pads_;
     return os;
 }
diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt
index dd8ebe306d..0525733103 100644
--- a/profiler/CMakeLists.txt
+++ b/profiler/CMakeLists.txt
@@ -43,7 +43,7 @@ set(PROFILER_SOURCE
 add_executable(ckProfiler ${PROFILER_SOURCE})
 
 target_link_libraries(ckProfiler PRIVATE host_tensor)
-target_link_libraries(ckProfiler PRIVATE conv_fwd_util)
+target_link_libraries(ckProfiler PRIVATE conv_util)
 target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp
index 7abbf7a042..3393110c33 100644
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -63,7 +63,7 @@ template <typename ADataType,
 bool profile_batched_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                int M,
                                int N,
                                int K,
@@ -356,11 +356,12 @@ bool profile_batched_gemm_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
 
-            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                      sizeof(CDataType) * M * N) *
                                     BatchCount;
 
diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp
index a6399c20d8..bd74dbf459 100644
--- a/profiler/include/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -53,7 +53,7 @@ template <typename ADataType,
 bool profile_batched_gemm_reduce_impl(int do_verification,
                                       int init_method,
                                       bool do_log,
-                                      int nrepeat,
+                                      bool time_kernel,
                                       int M,
                                       int N,
                                       int K,
@@ -259,30 +259,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            // warm up
-            invoker_ptr->Run(argument_ptr.get());
+            // init D0, D1 to 0
+            d0_device_buf.SetZero();
+            d1_device_buf.SetZero();
 
-            // timing
-            float total_time = 0;
-
-            for(int i = 0; i < nrepeat; ++i)
-            {
-                // init DO, D1 to 0
-                d0_device_buf.SetZero();
-                d1_device_buf.SetZero();
-
-                KernelTimer timer;
-
-                timer.Start();
-
-                invoker_ptr->Run(argument_ptr.get());
-
-                timer.End();
-
-                total_time += timer.GetElapsedTime();
-            }
-
-            float ave_time = total_time / nrepeat;
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::string gemm_name = gemm_ptr->GetTypeString();
 
diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp
index bec97e40f5..dfec033737 100644
--- a/profiler/include/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profile_conv_bwd_data_impl.hpp
@@ -51,7 +51,7 @@ template <int NDimSpatial,
 void profile_conv_bwd_data_impl(int do_verification,
                                 int init_method,
                                 bool do_log,
-                                int nrepeat,
+                                bool time_kernel,
                                 ck::index_t N,
                                 ck::index_t K,
                                 ck::index_t C,
@@ -228,7 +228,8 @@ void profile_conv_bwd_data_impl(int do_verification,
         {
             std::string conv_name = conv_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_conv_bwd_weight_impl.hpp b/profiler/include/profile_conv_bwd_weight_impl.hpp
index 20fe0ef549..8e3a4074b0 100644
--- a/profiler/include/profile_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profile_conv_bwd_weight_impl.hpp
@@ -1,4 +1,6 @@
 #pragma once
+
+#include "stream_config.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -43,7 +45,7 @@ template <int NDimSpatial,
 bool profile_conv_bwd_weight_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
-                                  int nrepeat,
+                                  bool time_kernel,
                                   ck::index_t N,
                                   ck::index_t K,
                                   ck::index_t C,
@@ -182,6 +184,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
 
     // profile device Conv instances
     bool pass = true;
+
     for(auto& conv_ptr : conv_ptrs)
     {
         // using atomic, so need to reset input
@@ -189,6 +192,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
         {
             wei_device_buf.SetZero();
         }
+
         auto argument_ptr = conv_ptr->MakeArgumentPointer(
             static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
             static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
@@ -214,7 +218,8 @@ bool profile_conv_bwd_weight_impl(int do_verification,
         {
             std::string conv_name = conv_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
@@ -242,6 +247,7 @@ bool profile_conv_bwd_weight_impl(int do_verification,
                 wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
 
                 float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
+
                 if(max_error > 8)
                 {
                     pass = false;
diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
index d0de7307d2..5ea35cd72f 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
@@ -42,7 +42,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_add_impl(int do_verification,
                                          int init_method,
                                          bool do_log,
-                                         int nrepeat,
+                                         bool time_kernel,
                                          ck::index_t N,
                                          ck::index_t K,
                                          ck::index_t C,
@@ -219,7 +219,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
         {
             std::string conv_name = op_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
index 9bdfa61283..f1c2fd300a 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
@@ -119,7 +119,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
                                                 int init_method,
                                                 bool do_log,
-                                                int nrepeat,
+                                                bool time_kernel,
                                                 ck::index_t N,
                                                 ck::index_t K,
                                                 ck::index_t C,
@@ -275,7 +275,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
         {
             std::string conv_name = op_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
index f34e52048e..eeb2b93e4e 100644
--- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp
@@ -41,7 +41,7 @@ template <int NDimSpatial,
 void profile_conv_fwd_bias_relu_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
-                                     int nrepeat,
+                                     bool time_kernel,
                                      ck::index_t N,
                                      ck::index_t K,
                                      ck::index_t C,
@@ -207,7 +207,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
         {
             std::string conv_name = op_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
 
diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp
index c9051f006f..291bf2abc0 100644
--- a/profiler/include/profile_convnd_bwd_data_impl.hpp
+++ b/profiler/include/profile_convnd_bwd_data_impl.hpp
@@ -1,7 +1,7 @@
 #pragma once
 #include "config.hpp"
 #include "device.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "tensor_layout.hpp"
@@ -269,7 +269,7 @@ template <int NDimSpatial,
 bool profile_convnd_bwd_data_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
-                                  int nrepeat,
+                                  bool time_kernel,
                                   ck::index_t N,
                                   ck::index_t K,
                                   ck::index_t C,
@@ -410,7 +410,8 @@ bool profile_convnd_bwd_data_impl(int do_verification,
         {
             std::string conv_name = conv_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop =
                 ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);
diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp
index 98e4ad76c9..8565f9637c 100644
--- a/profiler/include/profile_gemm_bias_2d_impl.hpp
+++ b/profiler/include/profile_gemm_bias_2d_impl.hpp
@@ -65,7 +65,7 @@ template <typename ADataType,
 void profile_gemm_bias_2d_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                int M,
                                int N,
                                int K,
@@ -259,7 +259,8 @@ void profile_gemm_bias_2d_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_bias_relu_add_impl.hpp b/profiler/include/profile_gemm_bias_relu_add_impl.hpp
index 75ed78075b..6fec17c199 100644
--- a/profiler/include/profile_gemm_bias_relu_add_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_add_impl.hpp
@@ -48,7 +48,7 @@ template <typename ADataType,
 void profile_gemm_bias_relu_add_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
-                                     int nrepeat,
+                                     bool time_kernel,
                                      int M,
                                      int N,
                                      int K,
@@ -232,7 +232,8 @@ void profile_gemm_bias_relu_add_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp
index 0735f3c31b..69010becc5 100644
--- a/profiler/include/profile_gemm_bias_relu_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_impl.hpp
@@ -48,7 +48,7 @@ template <typename ADataType,
 void profile_gemm_bias_relu_impl(int do_verification,
                                  int init_method,
                                  bool do_log,
-                                 int nrepeat,
+                                 bool time_kernel,
                                  int M,
                                  int N,
                                  int K,
@@ -212,7 +212,8 @@ void profile_gemm_bias_relu_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp
index 93262fe802..45e6174260 100644
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -91,7 +91,7 @@ template <typename ADataType,
 void profile_gemm_impl(int do_verification,
                        int init_method,
                        bool do_log,
-                       int nrepeat,
+                       bool time_kernel,
                        int M,
                        int N,
                        int K,
@@ -416,7 +416,8 @@ void profile_gemm_impl(int do_verification,
 
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp
index 6ef3e010b1..d034c9f750 100644
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -52,7 +52,7 @@ template <typename ADataType,
 bool profile_gemm_reduce_impl(int do_verification,
                               int init_method,
                               bool do_log,
-                              int nrepeat,
+                              bool time_kernel,
                               int M,
                               int N,
                               int K,
@@ -243,36 +243,18 @@ bool profile_gemm_reduce_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            // warm up
-            invoker_ptr->Run(argument_ptr.get());
+            // init D0, D1 to 0
+            d0_device_buf.SetZero();
+            d1_device_buf.SetZero();
 
-            // timing
-            float total_time = 0;
-
-            for(int i = 0; i < nrepeat; ++i)
-            {
-                // init DO, D1 to 0
-                d0_device_buf.SetZero();
-                d1_device_buf.SetZero();
-
-                KernelTimer timer;
-
-                timer.Start();
-
-                invoker_ptr->Run(argument_ptr.get());
-
-                timer.End();
-
-                total_time += timer.GetElapsedTime();
-            }
-
-            float ave_time = total_time / nrepeat;
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::string gemm_name = gemm_ptr->GetTypeString();
 
             std::size_t flop = std::size_t(2) * M * N * K;
 
-            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                     sizeof(CDataType) * M * N + sizeof(CDataType) * N;
 
             float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp
index ae70f551f1..96d34c7e42 100644
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -49,7 +49,7 @@ template <typename ADataType,
 void profile_grouped_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                const std::vector<int>& Ms,
                                const std::vector<int>& Ns,
                                const std::vector<int>& Ks,
@@ -231,7 +231,8 @@ void profile_grouped_gemm_impl(int do_verification,
         {
             std::string gemm_name = gemm_ptr->GetTypeString();
 
-            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float ave_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t flop = 0, num_btype = 0;
             for(std::size_t i = 0; i < gemm_shapes.size(); i++)
diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp
index 678134f60b..33c7929ddd 100644
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -157,7 +157,7 @@ void profile_reduce_impl_impl(bool do_verification,
                               int init_method,
                               bool do_log,
                               bool do_dumpout,
-                              int nrepeat,
+                              bool time_kernel,
                               const std::vector<size_t>& inLengths,
                               const std::vector<int>& reduceDims,
                               float alpha,
@@ -430,7 +430,8 @@ void profile_reduce_impl_impl(bool do_verification,
 
             auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
 
-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t num_bytes =
                 invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -516,7 +517,8 @@ void profile_reduce_impl_impl(bool do_verification,
 
             auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
 
-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
 
             std::size_t num_bytes =
                 invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -554,7 +556,8 @@ void profile_reduce_impl_impl(bool do_verification,
 
                 auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
 
-                float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat);
+                float avg_time_2 =
+                    invoker2_ptr->Run(argument2_ptr.get(), StreamConfig{nullptr, time_kernel});
 
                 std::size_t num_bytes_2 =
                     static_cast<size_t>(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType);
@@ -625,7 +628,7 @@ void profile_reduce_impl(bool do_verification,
                          int init_method,
                          bool do_log,
                          bool do_dumpout,
-                         int nrepeat,
+                         bool time_kernel,
                          const std::vector<size_t>& inLengths,
                          const std::vector<int>& reduceDims,
                          ReduceTensorOp ReduceOpId,
@@ -663,7 +666,7 @@ void profile_reduce_impl(bool do_verification,
             init_method,
             do_log,
             do_dumpout,
-            nrepeat,
+            time_kernel,
             inLengths,
             reduceDims,
             alpha,
diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp
index 2a806b0818..db5486e0ac 100644
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[])
         printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
         exit(1);
     }
@@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp
index 38c3f52193..f67e561865 100644
--- a/profiler/src/profile_batched_gemm_reduce.cpp
+++ b/profiler/src/profile_batched_gemm_reduce.cpp
@@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
         printf("arg15: split k into  mulitiple batch\n");
         exit(1);
@@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp
index 2861af3d10..206d486ea0 100644
--- a/profiler/src/profile_conv_bwd_data.cpp
+++ b/profiler/src/profile_conv_bwd_data.cpp
@@ -44,7 +44,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -57,7 +57,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -96,7 +96,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
@@ -122,7 +122,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
@@ -148,7 +148,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
@@ -174,7 +174,7 @@ int profile_conv_bwd_data(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_bwd_weight.cpp b/profiler/src/profile_conv_bwd_weight.cpp
index 309cc8ea2c..c022d19ee0 100644
--- a/profiler/src/profile_conv_bwd_weight.cpp
+++ b/profiler/src/profile_conv_bwd_weight.cpp
@@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
@@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp
index 1c447b483e..28aa49687f 100644
--- a/profiler/src/profile_conv_fwd_bias_relu.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu.cpp
@@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
index 522487c77b..7e033a51e2 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
index 833f2851db..095536f701 100644
--- a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
+++ b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     const ck::index_t N  = std::stoi(argv[10]);
     const ck::index_t K  = std::stoi(argv[11]);
@@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             N,
             K,
             C,
diff --git a/profiler/src/profile_convnd_bwd_data.cpp b/profiler/src/profile_convnd_bwd_data.cpp
index 893fb8c791..5d0e6a34c7 100644
--- a/profiler/src/profile_convnd_bwd_data.cpp
+++ b/profiler/src/profile_convnd_bwd_data.cpp
@@ -39,40 +39,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[],
     // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
     ck::utils::conv::ConvParams params;
 
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
         printf("arg6: verification (0: no; 1: yes)\n");
         printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
         printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         return 1;
@@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
 
     ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);
 
@@ -132,17 +132,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                 do_verification,
                 init_method,
                 do_log,
-                nrepeat,
-                params.N,
-                params.K,
-                params.C,
-                params.input_spatial_lengths,
-                params.filter_spatial_lengths,
+                time_kernel,
+                params.N_,
+                params.K_,
+                params.C_,
+                params.input_spatial_lengths_,
+                params.filter_spatial_lengths_,
                 params.GetOutputSpatialLengths(),
-                params.conv_filter_strides,
-                params.conv_filter_dilations,
-                params.input_left_pads,
-                params.input_right_pads);
+                params.conv_filter_strides_,
+                params.conv_filter_dilations_,
+                params.input_left_pads_,
+                params.input_right_pads_);
             break;
 
         case 2:
@@ -157,17 +157,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                 do_verification,
                 init_method,
                 do_log,
-                nrepeat,
-                params.N,
-                params.K,
-                params.C,
-                params.input_spatial_lengths,
-                params.filter_spatial_lengths,
+                time_kernel,
+                params.N_,
+                params.K_,
+                params.C_,
+                params.input_spatial_lengths_,
+                params.filter_spatial_lengths_,
                 params.GetOutputSpatialLengths(),
-                params.conv_filter_strides,
-                params.conv_filter_dilations,
-                params.input_left_pads,
-                params.input_right_pads);
+                params.conv_filter_strides_,
+                params.conv_filter_dilations_,
+                params.input_left_pads_,
+                params.input_right_pads_);
             break;
 
         case 3:
@@ -182,17 +182,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
                 do_verification,
                 init_method,
                 do_log,
-                nrepeat,
-                params.N,
-                params.K,
-                params.C,
-                params.input_spatial_lengths,
-                params.filter_spatial_lengths,
+                time_kernel,
+                params.N_,
+                params.K_,
+                params.C_,
+                params.input_spatial_lengths_,
+                params.filter_spatial_lengths_,
                 params.GetOutputSpatialLengths(),
-                params.conv_filter_strides,
-                params.conv_filter_dilations,
-                params.input_left_pads,
-                params.input_right_pads);
+                params.conv_filter_strides_,
+                params.conv_filter_dilations_,
+                params.input_left_pads_,
+                params.input_right_pads_);
             break;
 
         default: break;
diff --git a/profiler/src/profile_convnd_fwd.cpp b/profiler/src/profile_convnd_fwd.cpp
index 1abd73c729..722e86c2ea 100644
--- a/profiler/src/profile_convnd_fwd.cpp
+++ b/profiler/src/profile_convnd_fwd.cpp
@@ -5,7 +5,7 @@
 #include <vector>
 #include <half.hpp>
 
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "element_wise_operation.hpp"
 #include "fill.hpp"
 #include "profile_convnd_fwd.hpp"
@@ -119,7 +119,7 @@ template <int NDim,
 void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
                                    bool do_verification,
                                    bool do_log,
-                                   int nrepeat,
+                                   bool time_kernel,
                                    int init_method,
                                    ConvLayouts)
 {
@@ -185,7 +185,7 @@ void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
                                                                          reference_conv_fwd_fun);
     auto best_conf = run_engine.Profile(
         conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
-        nrepeat,
+        time_kernel,
         do_verification,
         do_log);
 
@@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type,
                               const ck::utils::conv::ConvParams& params,
                               bool do_verification,
                               bool do_log,
-                              int nrepeat,
+                              bool time_kernel,
                               int init_method)
 {
     switch(data_layout)
@@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;
@@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;
@@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
     bool do_verification{true};
     int init_method{2};
     bool do_log{false};
-    int nrepeat{100};
+    bool time_kernel{false};
     int num_dim_spatial{2};
     ConvParams params;
 
@@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
         do_verification = std::stoi(argv[4]);
         init_method     = std::stoi(argv[5]);
         do_log          = std::stoi(argv[6]);
-        nrepeat         = std::stoi(argv[7]);
+        time_kernel     = std::stoi(argv[7]);
         num_dim_spatial = std::stoi(argv[8]);
     }
     if(argc >= 10)
@@ -332,15 +332,15 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
     {
     case 1:
         profile_convnd_instances<1>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     case 2:
         profile_convnd_instances<2>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     case 3:
         profile_convnd_instances<3>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     default:
         throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index 7a72be2d8e..4c6a3b0487 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: split k into  mulitiple batch\n");
         exit(1);
@@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -74,7 +74,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -94,7 +94,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -114,7 +114,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -134,7 +134,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -154,7 +154,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -174,7 +174,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -194,7 +194,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -214,7 +214,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -234,7 +234,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -254,7 +254,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -274,7 +274,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -294,7 +294,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -314,7 +314,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -334,7 +334,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -354,7 +354,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -374,7 +374,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_bias_2d.cpp b/profiler/src/profile_gemm_bias_2d.cpp
index dd7e418087..46d4f90c17 100644
--- a/profiler/src/profile_gemm_bias_2d.cpp
+++ b/profiler/src/profile_gemm_bias_2d.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: alpha\n");
         printf("arg15: beta\n");
@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_bias_relu.cpp b/profiler/src/profile_gemm_bias_relu.cpp
index 67a47cf9ec..4346650c9f 100644
--- a/profiler/src/profile_gemm_bias_relu.cpp
+++ b/profiler/src/profile_gemm_bias_relu.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: split k into  mulitiple batch\n");
         exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_bias_relu_add.cpp b/profiler/src/profile_gemm_bias_relu_add.cpp
index 52406e93d6..186f32cf6f 100644
--- a/profiler/src/profile_gemm_bias_relu_add.cpp
+++ b/profiler/src/profile_gemm_bias_relu_add.cpp
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
         printf("arg15: split k into  mulitiple batch\n");
         exit(1);
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp
index a83d4ce9a1..986acaf010 100644
--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         printf("arg14: split k into  mulitiple batch\n");
         exit(1);
@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);
@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,
diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 88a2a8f855..d35484cfae 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
         printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
         printf("arg4: verification (0: no; 1: yes)\n");
         printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg8: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
                "64,64 64,64 128,128)\n");
         exit(1);
@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
 
     const auto Ms = argToIntArray(argv[8]);
     const auto Ns = argToIntArray(argv[9]);
@@ -86,7 +86,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
@@ -104,7 +104,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
@@ -122,7 +122,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
@@ -140,7 +140,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
                                                                                    init_method,
                                                                                    do_log,
-                                                                                   nrepeat,
+                                                                                   time_kernel,
                                                                                    Ms,
                                                                                    Ns,
                                                                                    Ks,
diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp
index 96fa78964a..5e91a1d2d1 100644
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -144,7 +144,7 @@ class AppArgs
     bool do_dumpout                = false;
 
     int init_method;
-    int nrepeat;
+    bool time_kernel;
 
     bool need_indices = false;
 
@@ -295,7 +295,7 @@ class AppArgs
             throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
 
         init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = std::atoi(argv[optind]);
 
         if(scales.empty())
         {
@@ -354,7 +354,7 @@ int profile_reduce(int argc, char* argv[])
                                                                     args.init_method,
                                                                     args.do_log,
                                                                     args.do_dumpout,
-                                                                    args.nrepeat,
+                                                                    args.time_kernel,
                                                                     args.inLengths,
                                                                     args.reduceDims,
                                                                     args.reduceOp,
@@ -369,7 +369,7 @@ int profile_reduce(int argc, char* argv[])
                                                                args.init_method,
                                                                args.do_log,
                                                                args.do_dumpout,
-                                                               args.nrepeat,
+                                                               args.time_kernel,
                                                                args.inLengths,
                                                                args.reduceDims,
                                                                args.reduceOp,
@@ -387,7 +387,7 @@ int profile_reduce(int argc, char* argv[])
                                                     args.init_method,
                                                     args.do_log,
                                                     args.do_dumpout,
-                                                    args.nrepeat,
+                                                    args.time_kernel,
                                                     args.inLengths,
                                                     args.reduceDims,
                                                     args.reduceOp,
@@ -414,7 +414,7 @@ int profile_reduce(int argc, char* argv[])
                                                         args.init_method,
                                                         args.do_log,
                                                         args.do_dumpout,
-                                                        args.nrepeat,
+                                                        args.time_kernel,
                                                         args.inLengths,
                                                         args.reduceDims,
                                                         args.reduceOp,
@@ -429,7 +429,7 @@ int profile_reduce(int argc, char* argv[])
                                                          args.init_method,
                                                          args.do_log,
                                                          args.do_dumpout,
-                                                         args.nrepeat,
+                                                         args.time_kernel,
                                                          args.inLengths,
                                                          args.reduceDims,
                                                          args.reduceOp,
@@ -454,7 +454,7 @@ int profile_reduce(int argc, char* argv[])
                                                              args.init_method,
                                                              args.do_log,
                                                              args.do_dumpout,
-                                                             args.nrepeat,
+                                                             args.time_kernel,
                                                              args.inLengths,
                                                              args.reduceDims,
                                                              args.reduceOp,
@@ -471,7 +471,7 @@ int profile_reduce(int argc, char* argv[])
                                                      args.init_method,
                                                      args.do_log,
                                                      args.do_dumpout,
-                                                     args.nrepeat,
+                                                     args.time_kernel,
                                                      args.inLengths,
                                                      args.reduceDims,
                                                      args.reduceOp,
@@ -486,7 +486,7 @@ int profile_reduce(int argc, char* argv[])
                                                       args.init_method,
                                                       args.do_log,
                                                       args.do_dumpout,
-                                                      args.nrepeat,
+                                                      args.time_kernel,
                                                       args.inLengths,
                                                       args.reduceDims,
                                                       args.reduceOp,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 304ce070ff..2ad13da7b4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,5 @@
 include_directories(BEFORE
+    ${PROJECT_SOURCE_DIR}/
     ${PROJECT_SOURCE_DIR}/include/ck
     ${PROJECT_SOURCE_DIR}/include/ck/utility
     ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
@@ -21,7 +22,8 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/external/include/half
 )
 
-add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+include(googletest)
+
 add_custom_target(tests)
 
 
@@ -41,7 +43,7 @@ function(add_gtest_executable TEST_NAME)
     add_dependencies(tests ${TEST_NAME})
     add_dependencies(check ${TEST_NAME})
     # suppress gtest warnings
-    target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors)
+    target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
     target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
     gtest_discover_tests(${TEST_NAME})
 endfunction(add_gtest_executable TEST_NAME)
@@ -60,4 +62,6 @@ add_subdirectory(grouped_gemm)
 add_subdirectory(convnd_fwd)
 add_subdirectory(reduce)
 add_subdirectory(conv2d_bwd_weight)
+add_subdirectory(convnd_bwd_data)
 add_subdirectory(cgemm)
+# Do NOT add client_app here; it is built and tested independently by CI.
diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
index ce061c644b..7b311cff17 100644
--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -22,7 +22,7 @@ int main()
                                                                   Row,
                                                                   Row,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, K, N, N, BatchCount);
+                       true, 1, false, false, M, N, K, K, N, N, BatchCount);
 
     pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                   ck::half_t,
@@ -31,7 +31,7 @@ int main()
                                                                   Row,
                                                                   Col,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, K, K, N, BatchCount);
+                       true, 1, false, false, M, N, K, K, K, N, BatchCount);
 
     pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                   ck::half_t,
@@ -40,7 +40,7 @@ int main()
                                                                   Col,
                                                                   Row,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, M, N, N, BatchCount);
+                       true, 1, false, false, M, N, K, M, N, N, BatchCount);
 
     pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                   ck::half_t,
@@ -49,7 +49,7 @@ int main()
                                                                   Col,
                                                                   Col,
                                                                   Row>(
-                       true, 1, false, 1, M, N, K, M, K, N, BatchCount);
+                       true, 1, false, false, M, N, K, M, K, N, BatchCount);
 
     if(pass)
     {
diff --git a/test/client_app/CMakeLists.txt b/test/client_app/CMakeLists.txt
new file mode 100644
index 0000000000..f8dd8c4e0a
--- /dev/null
+++ b/test/client_app/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.15)
+project(ck_app)
+add_compile_options(-std=c++14)
+# REQUIRED: stop at configure time if the package is missing, not later at the link step.
+find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS device_operations host_tensor)
+find_package(hip REQUIRED PATHS /opt/rocm)
+message(STATUS "Build with HIP ${hip_VERSION}")
+
+add_executable(test_client_app client_app.cpp)
+
+target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host)
diff --git a/test/client_app/client_app.cpp b/test/client_app/client_app.cpp
new file mode 100644
index 0000000000..665a103f70
--- /dev/null
+++ b/test/client_app/client_app.cpp
@@ -0,0 +1,77 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include <vector>
+
+#include "client_app_impl.hpp"
+
+int main(int argc, char* argv[])
+{
+    if(argc != 25)
+    {
+        printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
+        printf("arg9: time kernel (0=no, 1=yes)\n");
+        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(1);
+    }
+    // argv[1] (operation name) is not read; argv[2..9] select the run configuration.
+    const ConvDataType data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const int in_layout          = static_cast<ConvInputLayout>(std::stoi(argv[3]));
+    const int wei_layout         = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
+    const int out_layout         = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
+    const bool do_verification   = std::stoi(argv[6]);
+    const int init_method        = std::stoi(argv[7]);
+    const bool do_log            = std::stoi(argv[8]);
+    const bool time_kernel       = std::stoi(argv[9]);
+
+    const ck::index_t N  = std::stoi(argv[10]);
+    const ck::index_t K  = std::stoi(argv[11]);
+    const ck::index_t C  = std::stoi(argv[12]);
+    const ck::index_t Y  = std::stoi(argv[13]);
+    const ck::index_t X  = std::stoi(argv[14]);
+    const ck::index_t Hi = std::stoi(argv[15]);
+    const ck::index_t Wi = std::stoi(argv[16]);
+
+    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
+    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
+    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
+    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
+    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
+    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
+    // Filter extent once dilation is applied.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+    // Output spatial size derived from padded input, effective filter and stride.
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    ck::app::profile_conv_fwd_impl(do_verification,
+                                   init_method,
+                                   do_log,
+                                   time_kernel,
+                                   data_type,
+                                   N,
+                                   K,
+                                   C,
+                                   std::vector<ck::index_t>{Hi, Wi},
+                                   std::vector<ck::index_t>{Y, X},
+                                   std::vector<ck::index_t>{Ho, Wo},
+                                   std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
+                                   std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
+                                   std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
+                                   std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
+    return 0; // zero exit status so the shell/CI sees a completed run as success
+}
diff --git a/test/client_app/client_app_impl.hpp b/test/client_app/client_app_impl.hpp
new file mode 100644
index 0000000000..f9e4145ba0
--- /dev/null
+++ b/test/client_app/client_app_impl.hpp
@@ -0,0 +1,214 @@
+#pragma once
+
+#include "host_interface.hpp"
+
+enum ConvDataType // cast from CLI arg2 in client_app.cpp; picks which instance list is built
+{
+    F32_F32_F32,    // 0
+    F16_F16_F16,    // 1
+    BF16_BF16_BF16, // 2
+    INT8_INT8_INT8, // 3
+};
+
+enum ConvInputLayout // cast from CLI arg3 in client_app.cpp
+{
+    NCHW, // 0
+    NHWC, // 1
+};
+
+enum ConvWeightLayout // cast from CLI arg4 in client_app.cpp
+{
+    KCYX, // 0
+    KYXC, // 1
+};
+
+enum ConvOutputLayout // cast from CLI arg5 in client_app.cpp
+{
+    NKHW, // 0
+    NHWK, // 1
+};
+
+// If the last HIP call failed, print its error string to stderr and exit with that code.
+inline void check_hip_error(void) // inline: defined in a header, may be included more than once
+{
+    const hipError_t err = hipGetLastError();
+    if(err == hipSuccess)
+        return;
+    std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
+    exit(err);
+}
+inline std::string getDeviceName(int device) // name HIP reports for `device`; inline: in header
+{
+    struct hipDeviceProp_t prop;
+    hipGetDeviceProperties(&prop, device);
+    check_hip_error(); // exits the process if the property query failed
+    return std::string(prop.name);
+}
+
+inline int getDriver(void) // HIP driver version; inline because it is defined in a header
+{
+    int driver = 0; // defined value even if the query leaves it untouched
+    hipDriverGetVersion(&driver);
+    check_hip_error(); // exits the process if the version query failed
+    return driver;
+}
+
+namespace ck {
+namespace app {
+struct DeviceMem // owns a hipMalloc'd buffer of mMemSize bytes; freed in the destructor
+{
+    DeviceMem() = delete;
+    DeviceMem(std::size_t mem_size);
+    void* GetDeviceBuffer() { return mpDeviceBuf; }
+    void ToDevice(const void* p);
+    void FromDevice(void* p);
+    ~DeviceMem();
+
+    void* mpDeviceBuf;
+    std::size_t mMemSize;
+};
+
+inline DeviceMem::DeviceMem(std::size_t mem_size) : mpDeviceBuf(nullptr), mMemSize(mem_size)
+{
+    hipMalloc(&mpDeviceBuf, mMemSize);
+    check_hip_error(); // exit on allocation failure instead of carrying on with a bad pointer
+}
+
+inline void DeviceMem::ToDevice(const void* p)
+{
+    hipMemcpy(mpDeviceBuf, p, mMemSize, hipMemcpyHostToDevice); // p: host source, mMemSize bytes
+    check_hip_error();
+}
+
+inline void DeviceMem::FromDevice(void* p)
+{
+    hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost); // p: host destination
+    check_hip_error();
+}
+
+inline DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
+
+// Run every registered 2D forward-convolution instance that supports the given problem and
+// print per-instance and best results. do_verification, init_method and do_log are unused here.
+inline void profile_conv_fwd_impl(int do_verification,
+                                  int init_method,
+                                  bool do_log,
+                                  bool time_kernel,
+                                  ConvDataType data_type,
+                                  ck::index_t N,
+                                  ck::index_t K,
+                                  ck::index_t C,
+                                  std::vector<ck::index_t> input_spatial_lengths,
+                                  std::vector<ck::index_t> filter_spatial_lengths,
+                                  std::vector<ck::index_t> output_spatial_lengths,
+                                  std::vector<ck::index_t> conv_filter_strides,
+                                  std::vector<ck::index_t> conv_filter_dilations,
+                                  std::vector<ck::index_t> input_left_pads,
+                                  std::vector<ck::index_t> input_right_pads)
+{
+    const ck::index_t Y = filter_spatial_lengths[0];
+    const ck::index_t X = filter_spatial_lengths[1];
+
+    const ck::index_t Hi = input_spatial_lengths[0];
+    const ck::index_t Wi = input_spatial_lengths[1];
+
+    const ck::index_t Ho = output_spatial_lengths[0];
+    const ck::index_t Wo = output_spatial_lengths[1];
+
+    const std::size_t in_sz  = std::size_t(N) * C * Hi * Wi; // widen first: avoid index_t overflow
+    const std::size_t wei_sz = std::size_t(K) * C * Y * X;
+    const std::size_t out_sz = std::size_t(N) * K * Ho * Wo;
+
+    using WeiDataType = float;
+    using InDataType  = float;
+    using OutDataType = float;
+
+    app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz);
+    app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz);
+    app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz);
+    // NOTE: nothing is uploaded; the kernels run on whatever the fresh allocations contain
+
+    // add device Conv instances
+    std::vector<DeviceConvFwdPtr_t> conv_ptrs;
+    if(data_type == F16_F16_F16)
+    {
+        add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
+    }
+    else if(data_type == BF16_BF16_BF16)
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs);
+    else if(data_type == F32_F32_F32)
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs);
+    else if(data_type == INT8_INT8_INT8)
+        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs);
+    else
+        throw std::runtime_error("wrong! Invalid data type");
+    if(conv_ptrs.empty())
+        throw std::runtime_error("wrong! no device Conv instance found");
+
+    std::string best_conv_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    int deviceIndex       = 0;
+    hipSetDevice(deviceIndex);
+    check_hip_error();
+
+    StreamConfig stream_config{nullptr, time_kernel};
+    hipStreamCreate(&stream_config.stream_id_);
+    check_hip_error();
+
+    // profile device Conv instances
+    for(auto& conv_ptr : conv_ptrs)
+    {
+        auto argument_ptr =
+            conv_ptr.MakeArgumentPointer(static_cast<void*>(in_device_buf.GetDeviceBuffer()),
+                                         static_cast<void*>(wei_device_buf.GetDeviceBuffer()),
+                                         static_cast<void*>(out_device_buf.GetDeviceBuffer()),
+                                         N,
+                                         K,
+                                         C,
+                                         input_spatial_lengths,
+                                         filter_spatial_lengths,
+                                         output_spatial_lengths,
+                                         conv_filter_strides,
+                                         conv_filter_dilations,
+                                         input_left_pads,
+                                         input_right_pads);
+
+        auto invoker_ptr = conv_ptr.MakeInvokerPointer();
+
+        if(conv_ptr.IsSupportedArgument(argument_ptr.get()))
+        {
+            std::string conv_name = conv_ptr.GetTypeString();
+            float ave_time        = invoker_ptr->Run(argument_ptr.get(), stream_config);
+
+            std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+
+            std::size_t num_btype = sizeof(InDataType) * in_sz +
+                                    sizeof(WeiDataType) * wei_sz +
+                                    sizeof(OutDataType) * out_sz;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << conv_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_conv_name  = conv_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
+}
+
+} // namespace app
+} // namespace ck
diff --git a/test/conv2d_bwd_weight/CMakeLists.txt b/test/conv2d_bwd_weight/CMakeLists.txt
index 7b515b6b8e..ecd5336c1f 100644
--- a/test/conv2d_bwd_weight/CMakeLists.txt
+++ b/test/conv2d_bwd_weight/CMakeLists.txt
@@ -4,4 +4,4 @@ include_directories(BEFORE
 )
 
 add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
-target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_fwd_util)
+target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
diff --git a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp
index bb3ed985e3..671980f49e 100644
--- a/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp
+++ b/test/conv2d_bwd_weight/conv2d_bwd_weight.cpp
@@ -6,7 +6,7 @@
 #include <half.hpp>
 #include <vector>
 
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "profile_conv_bwd_weight_impl.hpp"
 
 int test_self()
@@ -28,20 +28,20 @@ int test_self()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads,
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_,
             2);
 
         // fp16
@@ -52,28 +52,28 @@ int test_self()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads,
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_,
             2);
     }
     return pass;
 }
 int main(int argc, char* argv[])
 {
-    int data_type   = 0;
-    int init_method = 0;
+    int data_type   = 1;
+    int init_method = 1;
 
     // Conv shape
     ck::index_t N               = 128;
@@ -155,20 +155,20 @@ int main(int argc, char* argv[])
                                                               ck::tensor_layout::convolution::NHWC,
                                                               ck::tensor_layout::convolution::KYXC,
                                                               ck::tensor_layout::convolution::NHWK>(
-                1,
+                true, // do_verification
                 init_method,
-                0,
-                1,
-                param.N,
-                param.K,
-                param.C,
-                param.input_spatial_lengths,
-                param.filter_spatial_lengths,
+                false, // do_log
+                false, // time_kernel
+                param.N_,
+                param.K_,
+                param.C_,
+                param.input_spatial_lengths_,
+                param.filter_spatial_lengths_,
                 param.GetOutputSpatialLengths(),
-                param.conv_filter_strides,
-                param.conv_filter_dilations,
-                param.input_left_pads,
-                param.input_right_pads,
+                param.conv_filter_strides_,
+                param.conv_filter_dilations_,
+                param.input_left_pads_,
+                param.input_right_pads_,
                 split_k);
         }
         else if(data_type == 1)
@@ -180,20 +180,20 @@ int main(int argc, char* argv[])
                                                               ck::tensor_layout::convolution::NHWC,
                                                               ck::tensor_layout::convolution::KYXC,
                                                               ck::tensor_layout::convolution::NHWK>(
-                1,
+                true, // do_verification
                 init_method,
-                0,
-                1,
-                param.N,
-                param.K,
-                param.C,
-                param.input_spatial_lengths,
-                param.filter_spatial_lengths,
+                false, // do_log
+                false, // time_kernel
+                param.N_,
+                param.K_,
+                param.C_,
+                param.input_spatial_lengths_,
+                param.filter_spatial_lengths_,
                 param.GetOutputSpatialLengths(),
-                param.conv_filter_strides,
-                param.conv_filter_dilations,
-                param.input_left_pads,
-                param.input_right_pads,
+                param.conv_filter_strides_,
+                param.conv_filter_dilations_,
+                param.input_left_pads_,
+                param.input_right_pads_,
                 split_k);
         }
         else
diff --git a/test/conv_util/CMakeLists.txt b/test/conv_util/CMakeLists.txt
index 70b3e851be..795c9ec0ac 100644
--- a/test/conv_util/CMakeLists.txt
+++ b/test/conv_util/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_gtest_executable(test_conv_util conv_util.cpp)
-target_link_libraries(test_conv_util PRIVATE host_tensor conv_fwd_util)
+target_link_libraries(test_conv_util PRIVATE host_tensor conv_util)
diff --git a/test/conv_util/conv_util.cpp b/test/conv_util/conv_util.cpp
index 453225e800..98f55b872e 100644
--- a/test/conv_util/conv_util.cpp
+++ b/test/conv_util/conv_util.cpp
@@ -1,10 +1,10 @@
 #include <iostream>
 #include <string>
 #include <vector>
-#include "gtest/gtest.h"
+#include <gtest/gtest.h>
 
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "tensor_layout.hpp"
 #include "check_err.hpp"
 
@@ -15,13 +15,13 @@ class TestConvUtil : public ::testing::Test
     public:
     void SetNDParams(std::size_t ndims)
     {
-        conv_params.num_dim_spatial        = ndims;
-        conv_params.filter_spatial_lengths = std::vector<ck::index_t>(ndims, 3);
-        conv_params.input_spatial_lengths  = std::vector<ck::index_t>(ndims, 71);
-        conv_params.conv_filter_strides    = std::vector<ck::index_t>(ndims, 2);
-        conv_params.conv_filter_dilations  = std::vector<ck::index_t>(ndims, 1);
-        conv_params.input_left_pads        = std::vector<ck::index_t>(ndims, 1);
-        conv_params.input_right_pads       = std::vector<ck::index_t>(ndims, 1);
+        conv_params.num_dim_spatial_        = ndims;
+        conv_params.filter_spatial_lengths_ = std::vector<ck::index_t>(ndims, 3);
+        conv_params.input_spatial_lengths_  = std::vector<ck::index_t>(ndims, 71);
+        conv_params.conv_filter_strides_    = std::vector<ck::index_t>(ndims, 2);
+        conv_params.conv_filter_dilations_  = std::vector<ck::index_t>(ndims, 1);
+        conv_params.input_left_pads_        = std::vector<ck::index_t>(ndims, 1);
+        conv_params.input_right_pads_       = std::vector<ck::index_t>(ndims, 1);
     }
 
     protected:
@@ -44,29 +44,29 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths2D)
                                      std::vector<ck::index_t>{36, 36},
                                      "Error: ConvParams 2D default constructor."));
 
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
-    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_ = std::vector<ck::index_t>{1, 1};
+    out_spatial_len                  = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(
         out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}."));
 
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
-    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2};
-    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2};
-    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_ = std::vector<ck::index_t>{2, 2};
+    conv_params.input_left_pads_     = std::vector<ck::index_t>{2, 2};
+    conv_params.input_right_pads_    = std::vector<ck::index_t>{2, 2};
+    out_spatial_len                  = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                      std::vector<ck::index_t>{37, 37},
                                      "Error: ConvParams 2D padding left/right {2,2}."));
 
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
-    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_dilations_ = std::vector<ck::index_t>{2, 2};
+    out_spatial_len                    = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(
         out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}."));
 
-    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3};
-    conv_params.input_left_pads       = std::vector<ck::index_t>{1, 1};
-    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1};
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
-    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_   = std::vector<ck::index_t>{3, 3};
+    conv_params.input_left_pads_       = std::vector<ck::index_t>{1, 1};
+    conv_params.input_right_pads_      = std::vector<ck::index_t>{1, 1};
+    conv_params.conv_filter_dilations_ = std::vector<ck::index_t>{2, 2};
+    out_spatial_len                    = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(
         ck::utils::check_err(out_spatial_len,
                              std::vector<ck::index_t>{23, 23},
@@ -81,29 +81,29 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths1D)
     EXPECT_TRUE(ck::utils::check_err(
         out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D."));
 
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{1};
-    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_ = std::vector<ck::index_t>{1};
+    out_spatial_len                  = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(
         out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}."));
 
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
-    conv_params.input_left_pads     = std::vector<ck::index_t>{2};
-    conv_params.input_right_pads    = std::vector<ck::index_t>{2};
-    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_ = std::vector<ck::index_t>{2};
+    conv_params.input_left_pads_     = std::vector<ck::index_t>{2};
+    conv_params.input_right_pads_    = std::vector<ck::index_t>{2};
+    out_spatial_len                  = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                      std::vector<ck::index_t>{37},
                                      "Error: ConvParams 1D padding left/right {2}."));
 
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
-    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_dilations_ = std::vector<ck::index_t>{2};
+    out_spatial_len                    = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(
         out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}."));
 
-    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3};
-    conv_params.input_left_pads       = std::vector<ck::index_t>{1};
-    conv_params.input_right_pads      = std::vector<ck::index_t>{1};
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
-    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_   = std::vector<ck::index_t>{3};
+    conv_params.input_left_pads_       = std::vector<ck::index_t>{1};
+    conv_params.input_right_pads_      = std::vector<ck::index_t>{1};
+    conv_params.conv_filter_dilations_ = std::vector<ck::index_t>{2};
+    out_spatial_len                    = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(
         ck::utils::check_err(out_spatial_len,
                              std::vector<ck::index_t>{23},
@@ -118,31 +118,31 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D)
     EXPECT_TRUE(ck::utils::check_err(
         out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D."));
 
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
-    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_ = std::vector<ck::index_t>{1, 1, 1};
+    out_spatial_len                  = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                      std::vector<ck::index_t>{71, 71, 71},
                                      "Error: ConvParams 3D stride {1, 1, 1}."));
 
-    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
-    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2, 2};
-    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2, 2};
-    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_ = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_left_pads_     = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_right_pads_    = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                  = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                      std::vector<ck::index_t>{37, 37, 37},
                                      "Error: ConvParams 3D padding left/right {2, 2, 2}."));
 
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
-    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_dilations_ = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                    = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                      std::vector<ck::index_t>{36, 36, 36},
                                      "Error: ConvParams 3D dilation {2, 2, 2}."));
 
-    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3, 3};
-    conv_params.input_left_pads       = std::vector<ck::index_t>{1, 1, 1};
-    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1, 1};
-    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
-    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    conv_params.conv_filter_strides_   = std::vector<ck::index_t>{3, 3, 3};
+    conv_params.input_left_pads_       = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_right_pads_      = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.conv_filter_dilations_ = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                    = conv_params.GetOutputSpatialLengths();
     EXPECT_TRUE(ck::utils::check_err(
         out_spatial_len,
         std::vector<ck::index_t>{23, 23, 23},
diff --git a/test/convnd_bwd_data/CMakeLists.txt b/test/convnd_bwd_data/CMakeLists.txt
index 58e6e7d3d0..55d71a41d3 100644
--- a/test/convnd_bwd_data/CMakeLists.txt
+++ b/test/convnd_bwd_data/CMakeLists.txt
@@ -4,4 +4,4 @@ include_directories(BEFORE
 )
 
 add_test_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
-target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_fwd_util)
+target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_util)
diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp
index cbc215033b..7284680e0e 100644
--- a/test/convnd_bwd_data/convnd_bwd_data.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data.cpp
@@ -27,20 +27,20 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<1,
                                                            ck::half_t,
@@ -50,20 +50,20 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<1,
                                                            ck::bhalf_t,
@@ -73,20 +73,20 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<1,
                                                            int8_t,
@@ -96,20 +96,20 @@ int main()
                                                            ck::tensor_layout::convolution::NWC,
                                                            ck::tensor_layout::convolution::KXC,
                                                            ck::tensor_layout::convolution::NWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
     }
 
     // check 2d
@@ -128,20 +128,20 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<2,
                                                            ck::half_t,
@@ -151,20 +151,20 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<2,
                                                            ck::bhalf_t,
@@ -174,20 +174,20 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<2,
                                                            int8_t,
@@ -197,20 +197,20 @@ int main()
                                                            ck::tensor_layout::convolution::NHWC,
                                                            ck::tensor_layout::convolution::KYXC,
                                                            ck::tensor_layout::convolution::NHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
     }
 
     // check 3d
@@ -232,20 +232,20 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<3,
                                                            ck::half_t,
@@ -255,20 +255,20 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<3,
                                                            ck::bhalf_t,
@@ -278,20 +278,20 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
 
         pass &= ck::profiler::profile_convnd_bwd_data_impl<3,
                                                            int8_t,
@@ -301,20 +301,20 @@ int main()
                                                            ck::tensor_layout::convolution::NDHWC,
                                                            ck::tensor_layout::convolution::KZYXC,
                                                            ck::tensor_layout::convolution::NDHWK>(
-            1, // do_verification,
-            1, // init_method,
-            0, // do_log,
-            1, // nrepeat,
-            param.N,
-            param.K,
-            param.C,
-            param.input_spatial_lengths,
-            param.filter_spatial_lengths,
+            true,  // do_verification
+            1,     // init_method
+            false, // do_log
+            false, // time_kernel
+            param.N_,
+            param.K_,
+            param.C_,
+            param.input_spatial_lengths_,
+            param.filter_spatial_lengths_,
             param.GetOutputSpatialLengths(),
-            param.conv_filter_strides,
-            param.conv_filter_dilations,
-            param.input_left_pads,
-            param.input_right_pads);
+            param.conv_filter_strides_,
+            param.conv_filter_dilations_,
+            param.input_left_pads_,
+            param.input_right_pads_);
     }
 
     if(pass)
diff --git a/test/convnd_fwd/CMakeLists.txt b/test/convnd_fwd/CMakeLists.txt
index 1d2ae3e4e3..34e698681b 100644
--- a/test/convnd_fwd/CMakeLists.txt
+++ b/test/convnd_fwd/CMakeLists.txt
@@ -1,13 +1,13 @@
 add_custom_target(test_convnd_fwd)
 
 add_gtest_executable(test_conv1d_fwd conv1d_fwd.cpp)
-target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_fwd_util)
+target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_util)
 add_dependencies(test_convnd_fwd test_conv1d_fwd)
 
 add_gtest_executable(test_conv2d_fwd conv2d_fwd.cpp)
-target_link_libraries(test_conv2d_fwd PRIVATE host_tensor device_conv2d_fwd_instance conv_fwd_util)
+target_link_libraries(test_conv2d_fwd PRIVATE host_tensor device_conv2d_fwd_instance conv_util)
 add_dependencies(test_convnd_fwd test_conv2d_fwd)
 
 add_gtest_executable(test_conv3d_fwd conv3d_fwd.cpp)
-target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_fwd_util)
+target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_util)
 add_dependencies(test_convnd_fwd test_conv3d_fwd)
diff --git a/test/convnd_fwd/conv1d_fwd.cpp b/test/convnd_fwd/conv1d_fwd.cpp
index c161b2795e..b6b6a89b2c 100644
--- a/test/convnd_fwd/conv1d_fwd.cpp
+++ b/test/convnd_fwd/conv1d_fwd.cpp
@@ -6,7 +6,7 @@
 
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
-#include "conv_fwd_util.hpp"
+#include "ck/library/utility/conv_util.hpp"
 #include "conv_util.hpp"
 
 namespace {
@@ -19,13 +19,13 @@ bool test_conv1d_nwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpPt
     namespace ctl = ck::tensor_layout::convolution;
 
     ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{71};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+    params.num_dim_spatial_        = 1;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{71};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};
 
     conv::ConvFwdOpInstance<T, T, T, ctl::NWC, ctl::KCX, ctl::NWK> conv_instance(params);
 
@@ -44,16 +44,16 @@ TEST(Conv1DFwdNWC, TestConv1D)
     namespace ctl = ck::tensor_layout::convolution;
 
     ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 4;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+    params.num_dim_spatial_        = 1;
+    params.N_                      = 2;
+    params.K_                      = 16;
+    params.C_                      = 4;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{16};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};
 
     std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
     test::conv::get_test_convolution_fwd_instance<1>(conv_ptrs);
diff --git a/test/convnd_fwd/conv2d_fwd.cpp b/test/convnd_fwd/conv2d_fwd.cpp
index e3815f778a..05e46147be 100644
--- a/test/convnd_fwd/conv2d_fwd.cpp
+++ b/test/convnd_fwd/conv2d_fwd.cpp
@@ -6,7 +6,7 @@
 
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
-#include "conv_fwd_util.hpp"
+#include "ck/library/utility/conv_util.hpp"
 #include "conv_util.hpp"
 
 namespace {
@@ -18,13 +18,13 @@ bool test_conv2d_nhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOpP
     using namespace ck::utils;
 
     conv::ConvParams params;
-    params.num_dim_spatial        = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{71, 71};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1};
+    params.num_dim_spatial_        = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{71, 71};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2, 2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1};
 
     conv::ConvFwdOpInstance<T, T, T> conv_instance(params);
 
@@ -42,11 +42,11 @@ TEST(Conv2DFwdNHWC, TestConv2D)
     using namespace ck::utils;
 
     ck::utils::conv::ConvParams params;
-    params.N                     = 2;
-    params.K                     = 16;
-    params.C                     = 4;
-    params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
-    params.conv_filter_strides   = std::vector<ck::index_t>{1, 1};
+    params.N_                     = 2;
+    params.K_                     = 16;
+    params.C_                     = 4;
+    params.input_spatial_lengths_ = std::vector<ck::index_t>{16, 16};
+    params.conv_filter_strides_   = std::vector<ck::index_t>{1, 1};
 
     std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
     test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs);
diff --git a/test/convnd_fwd/conv3d_fwd.cpp b/test/convnd_fwd/conv3d_fwd.cpp
index fc3da3e9c7..c6f0e7ec07 100644
--- a/test/convnd_fwd/conv3d_fwd.cpp
+++ b/test/convnd_fwd/conv3d_fwd.cpp
@@ -7,7 +7,7 @@
 
 #include "data_type.hpp"
 #include "element_wise_operation.hpp"
-#include "conv_fwd_util.hpp"
+#include "library/include/ck/library/utility/conv_util.hpp"
 #include "conv_util.hpp"
 
 namespace {
@@ -20,14 +20,14 @@ bool test_conv3d_ndhwc_instances(const std::vector<test::conv::DeviceConvFwdNoOp
     namespace ctl = ck::tensor_layout::convolution;
 
     conv::ConvParams params;
-    params.N                      = 64;
-    params.num_dim_spatial        = 3;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 2};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{32, 32, 2};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2, 2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+    params.N_                      = 64;
+    params.num_dim_spatial_        = 3;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 2};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{32, 32, 2};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2, 2, 2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1, 1};
 
     conv::ConvFwdOpInstance<T, T, T, ctl::NDHWC, ctl::KZYXC, ctl::NDHWK> conv_instance(params);
 
@@ -46,16 +46,16 @@ TEST(Conv3DFwdNDHWC, TestConv3D)
     namespace ctl = ck::tensor_layout::convolution;
 
     conv::ConvParams params;
-    params.num_dim_spatial        = 3;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 4;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{16, 16, 16};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 2;
+    params.K_                      = 16;
+    params.C_                      = 4;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{16, 16, 16};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1, 1};
 
     std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
     test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
@@ -77,16 +77,16 @@ TEST(Conv3DFwdNDHWC, InputOver2GB)
 
     // >2GB Input
     conv::ConvParams params;
-    params.num_dim_spatial        = 3;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 32;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{32, 1000, 1000};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 2;
+    params.K_                      = 16;
+    params.C_                      = 32;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{32, 1000, 1000};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1, 1};
 
     std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
     test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
@@ -94,16 +94,16 @@ TEST(Conv3DFwdNDHWC, InputOver2GB)
     auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
                                                      nullptr,
                                                      nullptr,
-                                                     params.N,
-                                                     params.K,
-                                                     params.C,
-                                                     params.input_spatial_lengths,
-                                                     params.filter_spatial_lengths,
+                                                     params.N_,
+                                                     params.K_,
+                                                     params.C_,
+                                                     params.input_spatial_lengths_,
+                                                     params.filter_spatial_lengths_,
                                                      params.GetOutputSpatialLengths(),
-                                                     params.conv_filter_strides,
-                                                     params.conv_filter_dilations,
-                                                     params.input_left_pads,
-                                                     params.input_right_pads,
+                                                     params.conv_filter_strides_,
+                                                     params.conv_filter_dilations_,
+                                                     params.input_left_pads_,
+                                                     params.input_right_pads_,
                                                      PassThrough{},
                                                      PassThrough{},
                                                      PassThrough{});
@@ -117,16 +117,16 @@ TEST(Conv3DFwdNDHWC, FiltersOver2GB)
 
     // >2GB Filters
     conv::ConvParams params;
-    params.num_dim_spatial        = 3;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 32;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{4, 1000, 1000};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{16, 16, 16};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 2;
+    params.K_                      = 16;
+    params.C_                      = 32;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{4, 1000, 1000};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{16, 16, 16};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1, 1};
 
     std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
     test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
@@ -134,16 +134,16 @@ TEST(Conv3DFwdNDHWC, FiltersOver2GB)
     auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
                                                      nullptr,
                                                      nullptr,
-                                                     params.N,
-                                                     params.K,
-                                                     params.C,
-                                                     params.input_spatial_lengths,
-                                                     params.filter_spatial_lengths,
+                                                     params.N_,
+                                                     params.K_,
+                                                     params.C_,
+                                                     params.input_spatial_lengths_,
+                                                     params.filter_spatial_lengths_,
                                                      params.GetOutputSpatialLengths(),
-                                                     params.conv_filter_strides,
-                                                     params.conv_filter_dilations,
-                                                     params.input_left_pads,
-                                                     params.input_right_pads,
+                                                     params.conv_filter_strides_,
+                                                     params.conv_filter_dilations_,
+                                                     params.input_left_pads_,
+                                                     params.input_right_pads_,
                                                      PassThrough{},
                                                      PassThrough{},
                                                      PassThrough{});
@@ -157,32 +157,32 @@ TEST(Conv3DFwdNDHWC, OutputOver2GB)
 
     // >2GB Output
     conv::ConvParams params;
-    params.num_dim_spatial        = 3;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{1, 1, 1};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{1000, 1000, 30};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{2, 2, 2};
-    params.input_right_pads       = std::vector<ck::index_t>{2, 2, 2};
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 2;
+    params.K_                      = 16;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{1, 1, 1};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{1000, 1000, 30};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{2, 2, 2};
+    params.input_right_pads_       = std::vector<ck::index_t>{2, 2, 2};
 
     std::vector<test::conv::DeviceConvFwdNoOpPtr> conv_ptrs;
     test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs);
     auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr,
                                                      nullptr,
                                                      nullptr,
-                                                     params.N,
-                                                     params.K,
-                                                     params.C,
-                                                     params.input_spatial_lengths,
-                                                     params.filter_spatial_lengths,
+                                                     params.N_,
+                                                     params.K_,
+                                                     params.C_,
+                                                     params.input_spatial_lengths_,
+                                                     params.filter_spatial_lengths_,
                                                      params.GetOutputSpatialLengths(),
-                                                     params.conv_filter_strides,
-                                                     params.conv_filter_dilations,
-                                                     params.input_left_pads,
-                                                     params.input_right_pads,
+                                                     params.conv_filter_strides_,
+                                                     params.conv_filter_dilations_,
+                                                     params.input_left_pads_,
+                                                     params.input_right_pads_,
                                                      PassThrough{},
                                                      PassThrough{},
                                                      PassThrough{});
diff --git a/test/convnd_fwd/conv_util.hpp b/test/convnd_fwd/conv_util.hpp
index 4f77101563..09f641b415 100644
--- a/test/convnd_fwd/conv_util.hpp
+++ b/test/convnd_fwd/conv_util.hpp
@@ -4,7 +4,6 @@
 #include <tuple>
 
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
 #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
 #include "host_tensor.hpp"
diff --git a/test/gemm_reduce/gemm_reduce_fp16.cpp b/test/gemm_reduce/gemm_reduce_fp16.cpp
index 8deb66b2b0..6c7bb9658f 100644
--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -16,22 +16,22 @@ int main()
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Row, Row>(
-                   true, 1, false, 1, M, N, K, K, N, N);
+                   true, 1, false, false, M, N, K, K, N, N);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Col, Row>(
-                   true, 1, false, 1, M, N, K, K, K, N);
+                   true, 1, false, false, M, N, K, K, K, N);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Row, Row>(
-                   true, 1, false, 1, M, N, K, M, N, N);
+                   true, 1, false, false, M, N, K, M, N, N);
 
     pass = pass &&
            ck::profiler::
                profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Col, Row>(
-                   true, 1, false, 1, M, N, K, M, K, N);
+                   true, 1, false, false, M, N, K, M, K, N);
 
     if(pass)
     {
diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp
index c788b66aa3..b63361aa1b 100644
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -187,9 +187,10 @@ int test_gemm(const gemmArgs& args)
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            invoker_ptr->Run(argument_ptr.get(), 0);
+            invoker_ptr->Run(argument_ptr.get());
 
             c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
             if(!check_out(c_m_n_host_result, c_m_n_device_result))
             {
                 success = false;
diff --git a/test/reference_conv_fwd/CMakeLists.txt b/test/reference_conv_fwd/CMakeLists.txt
index e5a7b31aff..04b720b169 100644
--- a/test/reference_conv_fwd/CMakeLists.txt
+++ b/test/reference_conv_fwd/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_gtest_executable(test_reference_conv_fwd reference_conv_fwd.cpp)
-target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_fwd_util)
+target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_util)
diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp
index f660559e62..69b223989f 100644
--- a/test/reference_conv_fwd/reference_conv_fwd.cpp
+++ b/test/reference_conv_fwd/reference_conv_fwd.cpp
@@ -8,7 +8,7 @@
 
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "element_wise_operation.hpp"
 #include "fill.hpp"
 #include "host_tensor.hpp"
@@ -34,21 +34,21 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
                                   const FillInputOp& fill_input_op     = FillInputOp{},
                                   const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
 {
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -74,10 +74,10 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
     auto ref_argument = ref_conv.MakeArgument(input,
                                               weights,
                                               host_output,
-                                              params.conv_filter_strides,
-                                              params.conv_filter_dilations,
-                                              params.input_left_pads,
-                                              params.input_right_pads,
+                                              params.conv_filter_strides_,
+                                              params.conv_filter_dilations_,
+                                              params.input_left_pads_,
+                                              params.input_right_pads_,
                                               InElementOp{},
                                               WeiElementOp{},
                                               OutElementOp{});
@@ -91,15 +91,15 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParams& params,
 TEST(ReferenceConvolutionFWD, Conv2DNHWC)
 {
     ck::utils::conv::ConvParams params;
-    params.N                      = 1;
-    params.K                      = 1;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6, 6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0};
+    params.N_                      = 1;
+    params.K_                      = 1;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0};
 
     auto out_tensor = run_reference_convolution_forward<2>(params);
     std::vector<std::size_t> ref_dims{1, 1, 4, 4};
@@ -127,15 +127,15 @@ TEST(ReferenceConvolutionFWD, Conv2DNHWC)
 TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding)
 {
     ck::utils::conv::ConvParams params;
-    params.N                      = 1;
-    params.K                      = 2;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12, 12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{2, 2};
-    params.input_left_pads        = std::vector<ck::index_t>{1, 1};
-    params.input_right_pads       = std::vector<ck::index_t>{1, 1};
+    params.N_                      = 1;
+    params.K_                      = 2;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2, 2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{2, 2};
+    params.input_left_pads_        = std::vector<ck::index_t>{1, 1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1, 1};
 
     auto out_tensor                   = run_reference_convolution_forward<2>(params);
     std::vector<std::size_t> ref_dims = std::vector<std::size_t>{1, 2, 5, 5};
@@ -153,16 +153,16 @@ TEST(ReferenceConvolutionFWD, Conv2DNHWCStridesDilationsPadding)
 TEST(ReferenceConvolutionFWD, Conv1DNWC)
 {
     ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
-    params.N                      = 1;
-    params.K                      = 1;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{0};
-    params.input_right_pads       = std::vector<ck::index_t>{0};
+    params.num_dim_spatial_        = 1;
+    params.N_                      = 1;
+    params.K_                      = 1;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0};
 
     auto out_tensor =
         run_reference_convolution_forward<1,
@@ -182,16 +182,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWC)
 TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding)
 {
     ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
-    params.N                      = 1;
-    params.K                      = 2;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{2};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{2};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+    params.num_dim_spatial_        = 1;
+    params.N_                      = 1;
+    params.K_                      = 2;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{2};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{2};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};
 
     auto out_tensor =
         run_reference_convolution_forward<1,
@@ -211,16 +211,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWCStridesDilationsPadding)
 TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize)
 {
     ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 1;
-    params.N                      = 2;
-    params.K                      = 16;
-    params.C                      = 4;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
-    params.input_left_pads        = std::vector<ck::index_t>{1};
-    params.input_right_pads       = std::vector<ck::index_t>{1};
+    params.num_dim_spatial_        = 1;
+    params.N_                      = 2;
+    params.K_                      = 16;
+    params.C_                      = 4;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{16};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1};
+    params.input_left_pads_        = std::vector<ck::index_t>{1};
+    params.input_right_pads_       = std::vector<ck::index_t>{1};
 
     auto out_tensor2 = run_reference_convolution_forward<1,
                                                          float,
@@ -305,16 +305,16 @@ TEST(ReferenceConvolutionFWD, Conv1DNWCSameOutputSize)
 TEST(ReferenceConvolutionFWD, Conv3DNCDHW)
 {
     ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 3;
-    params.N                      = 1;
-    params.K                      = 1;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{6, 6, 6};
-    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1, 1};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0, 0};
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 1;
+    params.K_                      = 1;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{6, 6, 6};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{1, 1, 1};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};
 
     auto out_tensor = run_reference_convolution_forward<3,
                                                         float,
@@ -344,16 +344,16 @@ TEST(ReferenceConvolutionFWD, Conv3DNCDHW)
 TEST(ReferenceConvolutionFWD, Conv3DNCDHWStridesDilations)
 {
     ck::utils::conv::ConvParams params;
-    params.num_dim_spatial        = 3;
-    params.N                      = 1;
-    params.K                      = 2;
-    params.C                      = 2;
-    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
-    params.input_spatial_lengths  = std::vector<ck::index_t>{12, 12, 12};
-    params.conv_filter_strides    = std::vector<ck::index_t>{3, 3, 3};
-    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
-    params.input_left_pads        = std::vector<ck::index_t>{0, 0, 0};
-    params.input_right_pads       = std::vector<ck::index_t>{0, 0, 0};
+    params.num_dim_spatial_        = 3;
+    params.N_                      = 1;
+    params.K_                      = 2;
+    params.C_                      = 2;
+    params.filter_spatial_lengths_ = std::vector<ck::index_t>{3, 3, 3};
+    params.input_spatial_lengths_  = std::vector<ck::index_t>{12, 12, 12};
+    params.conv_filter_strides_    = std::vector<ck::index_t>{3, 3, 3};
+    params.conv_filter_dilations_  = std::vector<ck::index_t>{1, 1, 1};
+    params.input_left_pads_        = std::vector<ck::index_t>{0, 0, 0};
+    params.input_right_pads_       = std::vector<ck::index_t>{0, 0, 0};
 
     auto out_tensor = run_reference_convolution_forward<3,
                                                         float,