Merge branch 'develop' into moe_xcd_remap

2026-07-01 04:07:56 +00:00 · 2025-11-20 11:01:33 -08:00
parent 6ba822688a 938b8ed3bf
commit 661a4f54a0
1942 changed files with 27356 additions and 18452 deletions
--- a/.github/workflows/therock-ci-linux.yml
+++ b/.github/workflows/therock-ci-linux.yml
@@ -53,8 +53,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
-          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit
          path: "TheRock"
+          ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit

      - name: Setup ccache
        run: |
@@ -77,6 +77,8 @@ jobs:
      - name: Patch rocm-libraries
        run: |
          git config --global --add safe.directory '*'
+          # Remove patches here if they cannot be applied cleanly, and they have not been deleted from TheRock repo
+          rm -f ./TheRock/patches/amd-mainline/rocm-libraries/0008-Revert-remove-options-no-enumerate-966.patch
          git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch

      - name: Install python deps
@@ -128,7 +130,7 @@ jobs:
        run: |
          python3 TheRock/build_tools/github_actions/post_build_upload.py \
            --run-id ${{ github.run_id }} \
-            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
+            --artifact-group ${{ env.AMDGPU_FAMILIES }} \
            --build-dir TheRock/build \
            --upload

--- a/.github/workflows/therock-test-component.yml
+++ b/.github/workflows/therock-test-component.yml
@@ -51,13 +51,13 @@ jobs:
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          repository: "ROCm/TheRock"
-          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit
+          ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit

      - name: Run setup test environment workflow
        uses: './.github/actions/setup_test_environment'
        with:
          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
-          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          ARTIFACT_GROUP: ${{ inputs.amdgpu_families }}
          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
          VENV_DIR: ${{ env.VENV_DIR }}
          FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
--- a/.github/workflows/therock-test-packages.yml
+++ b/.github/workflows/therock-test-packages.yml
@@ -27,7 +27,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
-          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit
+          ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit

      - name: "Configuring CI options"
        env:
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,11 +2,10 @@

 Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).

-## (Unreleased) Composable Kernel for ROCm
-
-### Added 
+## Composable Kernel 1.2.0 for ROCm 7.2.0

 ### Added
+* Added support for bf16 data type to grouped_gemm and grouped_gemm_preshuffle.
 * Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM
 * Added a compute async pipeline in the CK TILE universal GEMM on gfx950
 * Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
@@ -32,6 +31,16 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added an optional template parameter `Arch` (`gfx9_t`, `gfx12_t` etc.) to `make_kernel` to support linking multiple object files that have the same kernel compiled for different architectures.
 * FMHA examples and tests can be built for multiple architectures (gfx9, gfx950, gfx12) at the same time.

+### Upcoming changes
+
+* Composable Kernel will be adopting C++20 features in an upcoming ROCm release, updating the minimum compiler requirement to C++20. Ensure that your development environment complies with this requirement to facilitate a seamless transition.
+
+## Composable Kernel 1.1.0 for ROCm 7.1.1
+
+### Upcoming changes
+
+* Composable Kernel will be adopting C++20 features in an upcoming ROCm release, updating the minimum compiler requirement to C++20. Ensure that your development environment complies with this requirement to facilitate a seamless transition.
+
 ## Composable Kernel 1.1.0 for ROCm 7.1.0

 ### Added
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -122,7 +122,7 @@ add_compile_options(
 # Recent change in compiler makes this warning ON by default, which led to compile errors.
 add_compile_options(-Wno-nrvo)

-if(NOT DISABLE_DL_KERNELS)
+if(NOT DISABLE_DL_KERNELS AND GPU_TARGETS MATCHES "gfx101|gfx103|gfx10-1|gfx10-3")
    add_definitions(-DDL_KERNELS)
    set(DL_KERNELS "ON")
    set(CK_ENABLE_DL_KERNELS "ON")
@@ -683,6 +683,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
        PACKAGE_NAME examples
   )
   add_subdirectory(example)
+
+   add_subdirectory(tutorial)
+   rocm_package_setup_component(tutorials
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME tutorials
+   )
   add_subdirectory(tile_engine)
   if(BUILD_TESTING)
       add_subdirectory(test)
--- a/Dockerfile.aiter
+++ b/Dockerfile.aiter
@@ -17,4 +17,6 @@ RUN pip install pandas zmq einops ninja && \
    useradd -u 1001 -g 1001 -m -s /bin/bash jenkins && \
    chown -R jenkins:jenkins /home/jenkins && \
    chmod -R a+rwx /home/jenkins && \
+    chown -R jenkins:jenkins /tmp && \
+    chmod -R a+rwx /tmp && \
    sudo usermod -aG irc jenkins
--- a/519
+++ b/519
@@ -213,39 +213,6 @@ def check_host() {
    }
 }

-def build_compiler(){
-    def compiler
-    compiler = "${params.BUILD_COMPILER}"
-    return compiler
-}
-
-def check_arch(){
-    def arch_type = 0
-    sh 'rocminfo | tee rocminfo.log'
-    if ( runShell('grep -n "gfx90a" rocminfo.log') ){
-        arch_type = 1
-    }
-    else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
-        arch_type = 2
-    }
-    else if ( runShell('grep -n "gfx10" rocminfo.log') ) {
-        arch_type = 3
-    }
-    else if ( runShell('grep -n "gfx11" rocminfo.log') ) {
-        arch_type = 4
-    }
-    else if ( runShell('grep -n "gfx12" rocminfo.log') ) {
-        arch_type = 5
-    }
-    else if ( runShell('grep -n "gfx908" rocminfo.log') ) {
-        arch_type = 6
-    }
-    else if ( runShell('grep -n "gfx950" rocminfo.log') ) {
-        arch_type = 7
-    }
-    return arch_type
-}
-
 def check_arch_name(){
    def arch_name = ""
    sh 'rocminfo | tee rocminfo.log'
@@ -261,7 +228,7 @@ def check_arch_name(){
    else if ( runShell('grep -n "gfx11" rocminfo.log') ) {
        arch_name = "gfx11"
    }
-    else if ( runShell('grep -n "gfx12" rocminfo.log') ) {
+    else if ( runShell('grep -n "gfx120" rocminfo.log') ) {
        arch_name = "gfx12"
    }
    else if ( runShell('grep -n "gfx908" rocminfo.log') ) {
@@ -274,21 +241,8 @@ def check_arch_name(){
 }

 def getDockerImage(Map conf=[:]){
-    env.DOCKER_BUILDKIT=1
-    def prefixpath = conf.get("prefixpath", "/opt/rocm")
-    def no_cache = conf.get("no_cache", false)
-    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
-    if(no_cache)
-    {
-        dockerArgs = dockerArgs + " --no-cache "
-    }
-    echo "Docker Args: ${dockerArgs}"
    def image
-    if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){
-        image = conf.get("docker_name", "")
-        echo "Using legacy docker: ${image}"
-    }
-    else if ( (params.BUILD_GFX950 || params.RUN_CK_TILE_FMHA_TESTS) && conf.get("docker_name", "") != "" ){
+    if ( conf.get("docker_name", "") != "" ){
        image = conf.get("docker_name", "")
        echo "Using special docker: ${image}"
    }
@@ -361,11 +315,51 @@ def buildDocker(install_prefix){
    }
 }

+def get_docker_options(){
+    def dockerOpts
+    if ( params.BUILD_INSTANCES_ONLY ){
+        dockerOpts = "--network=host --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    }
+    else{ //only add kfd and dri paths if you actually going to run somthing on GPUs
+        dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    }
+    if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
+    // the  --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with
+    // newer clang22 compilers and running with older hip runtima libraries
+        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 "
+    }
+    // on some machines the group ids for video and render groups may not be the same as in the docker image!
+    def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
+    def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
+    dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
+    echo "Docker flags: ${dockerOpts}"
+    return dockerOpts
+}
+
+def build_client_examples(String arch){
+    def cmd = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+                cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+                -DGPU_TARGETS="${arch}" \
+                -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
+                -DCMAKE_HIP_COMPILER="${params.BUILD_COMPILER}" \
+                -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+    return cmd
+}
+
+def build_and_run_fmha(String arch){
+    def cmd = """ cmake -G Ninja -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+                -DGPU_TARGETS="${arch}" \
+                -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
+                -DCMAKE_HIP_COMPILER="${params.BUILD_COMPILER}" .. && \
+                ninja -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
+                cd ../ &&
+                example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" "${arch}" """
+    return cmd
+}
+
 def cmake_build(Map conf=[:]){

-    def compiler = build_compiler()
    def config_targets = conf.get("config_targets","check")
-    def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "")
    def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","")
    def prefixpath = conf.get("prefixpath","/opt/rocm")
    def setup_args = conf.get("setup_args","")
@@ -376,10 +370,8 @@ def cmake_build(Map conf=[:]){
        setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} "
    }

-    def build_type_debug = (conf.get("build_type",'release') == 'debug')
-
    //cmake_env can overwrite default CXX variables.
-    def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","")
+    def cmake_envs = "CXX=${params.BUILD_COMPILER} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","")

    if(conf.get("build_install","") == "true")
    {
@@ -392,15 +384,10 @@ def cmake_build(Map conf=[:]){
        setup_args = setup_args + " -DDISABLE_DL_KERNELS=ON "
    }

-    if(build_type_debug){
-        setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args
-    }else{
-        setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
-    }
+    setup_args = " -DCMAKE_BUILD_TYPE=release " + setup_args

    def pre_setup_cmd = """
            #!/bin/bash
-            echo \$HSA_ENABLE_SDMA
            ulimit -c unlimited
            rm -rf build
            mkdir build
@@ -478,13 +465,12 @@ def cmake_build(Map conf=[:]){
    def build_cmd
    def execute_cmd = conf.get("execute_cmd", "")
    if(!setup_args.contains("NO_CK_BUILD")){
-        def cmake_flags = params.NINJA_FTIME_TRACE ? "-O3 -ftime-trace" : "-O3"
        if (params.NINJA_BUILD_TRACE) {
            echo "running ninja build trace"
        }
        setup_cmd = conf.get(
            "setup_cmd",
-            """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" ${cmake_flags} " .. """
+            """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" -O3 " .. """
        )
        build_cmd = conf.get(
            "build_cmd",
@@ -566,16 +552,11 @@ def cmake_build(Map conf=[:]){
    }

    //check the node gpu architecture
-    def arch = check_arch()
+    def arch_name = check_arch_name()
    if (params.RUN_CK_TILE_FMHA_TESTS){
        try{
            archiveArtifacts "perf_fmha_*.log"
-            if (arch == 1){
-                stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a"
-            }
-            else if (arch == 2){
-                stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942"
-            }
+            stash includes: "perf_fmha_**.log", name: "perf_fmha_log_${arch_name}"
        }
        catch(Exception err){
            echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
@@ -585,40 +566,15 @@ def cmake_build(Map conf=[:]){

 def buildHipClangJob(Map conf=[:]){
        show_node_info()
-
-        env.HSA_ENABLE_SDMA=0
        checkout scm
        def prefixpath = conf.get("prefixpath", "/opt/rocm")
-
-        // Jenkins is complaining about the render group
-        def dockerOpts
-        if ( params.BUILD_INSTANCES_ONLY ){
-            dockerOpts = "--group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
-        }
-        else{
-            dockerOpts = "--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
-        }
-        if (conf.get("enforce_xnack_on", false)) {
-            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
-        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
-            // the  --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with
-            // newer clang22 compilers and running with older hip runtima libraries
-            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 "
-        }
-        def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
-        def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
-        dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
-        echo "Docker flags: ${dockerOpts}"
-
-        def variant = env.STAGE_NAME
+        def dockerOpts = get_docker_options()
        def image
        def retimage
        (retimage, image) = getDockerImage(conf)

-        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
-            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
+            withDockerContainer(image: image, args: dockerOpts) {
                timeout(time: 20, unit: 'HOURS')
                {
                    cmake_build(conf)
@@ -628,10 +584,6 @@ def buildHipClangJob(Map conf=[:]){
        return retimage
 }

-def reboot(){
-    build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
-}
-
 def buildHipClangJobAndReboot(Map conf=[:]){
    try{
        buildHipClangJob(conf)
@@ -641,45 +593,17 @@ def buildHipClangJobAndReboot(Map conf=[:]){
        echo 'Exception occurred: ' + e.toString()
        throw e
    }
-    finally{
-        if (!conf.get("no_reboot", false)) {
-            reboot()
-        }
-    }
 }

 def Build_CK(Map conf=[:]){
        show_node_info()
-
-        env.HSA_ENABLE_SDMA=0
-        env.DOCKER_BUILDKIT=1
        checkout scm
        def prefixpath = conf.get("prefixpath", "/opt/rocm")
-
-        // Jenkins is complaining about the render group
-        def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
-        if (conf.get("enforce_xnack_on", false)) {
-            dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
-        }
-        def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
-        if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
-            // the  --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with
-            // newer clang22 compilers and running with older hip runtima libraries
-            dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 "
-        }
-        if(params.BUILD_LEGACY_OS){
-            dockerOpts = dockerOpts + " --env LD_LIBRARY_PATH='/opt/Python-3.8.13/lib' "
-        }
-        def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
-        def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
-        dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
-        echo "Docker flags: ${dockerOpts}"
-
-        def variant = env.STAGE_NAME
+        def dockerOpts=get_docker_options()
        def image
        def retimage

-        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
+        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
            try {
                (retimage, image) = getDockerImage(conf)
                withDockerContainer(image: image, args: dockerOpts) {
@@ -698,11 +622,11 @@ def Build_CK(Map conf=[:]){
                echo "The job was cancelled or aborted"
                throw e
            }
-            withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+            withDockerContainer(image: image, args: dockerOpts) {
                timeout(time: 20, unit: 'HOURS')
                {
                    //check whether to run performance tests on this node
-                    def arch = check_arch()
+                    def arch = check_arch_name()
                    cmake_build(conf)
                    if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch == 1 ){
                            echo "Run inductor codegen tests"
@@ -717,94 +641,50 @@ def Build_CK(Map conf=[:]){
                    // run performance tests, stash the logs, results will be processed on the master node
 					dir("script"){
                        if (params.RUN_PERFORMANCE_TESTS){
-                        if (params.RUN_FULL_QA && arch == 1){
-                            // run full tests on gfx90a
+                        if (params.RUN_FULL_QA && (arch == "gfx90a" || arch == "gfx942")){
+                            // run full tests on gfx90a or gfx942
                            echo "Run full performance tests"
-                            sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx90a"
-                            archiveArtifacts "perf_gemm_gfx90a.log"
-                            archiveArtifacts "perf_resnet50_N256_gfx90a.log"
-                            archiveArtifacts "perf_resnet50_N4_gfx90a.log"
-                            archiveArtifacts "perf_batched_gemm_gfx90a.log"
-                            archiveArtifacts "perf_grouped_gemm_gfx90a.log"
-                            archiveArtifacts "perf_grouped_conv_fwd_gfx90a.log"
-                            archiveArtifacts "perf_grouped_conv_bwd_data_gfx90a.log"
-                            archiveArtifacts "perf_grouped_conv_bwd_weight_gfx90a.log"
-                            archiveArtifacts "perf_gemm_bilinear_gfx90a.log"
-                            archiveArtifacts "perf_reduction_gfx90a.log"
-                            archiveArtifacts "perf_splitK_gemm_gfx90a.log"
-                            archiveArtifacts "perf_onnx_gemm_gfx90a.log"
-                            archiveArtifacts "perf_mixed_gemm_gfx90a.log"
-                            stash includes: "perf_**.log", name: "perf_log_gfx90a"
+                            sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} ${arch}"
+                            archiveArtifacts "perf_*.log"
+                            stash includes: "perf_**.log", name: "perf_log_${arch}"
                        }
-                        if (params.RUN_FULL_QA && arch == 2){
-                            // run full tests on gfx942
-                            echo "Run full performance tests"
-                            sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx942"
-                            archiveArtifacts "perf_gemm_gfx942.log"
-                            archiveArtifacts "perf_resnet50_N256_gfx942.log"
-                            archiveArtifacts "perf_resnet50_N4_gfx942.log"
-                            archiveArtifacts "perf_batched_gemm_gfx942.log"
-                            archiveArtifacts "perf_grouped_gemm_gfx942.log"
-                            archiveArtifacts "perf_grouped_conv_fwd_gfx942.log"
-                            archiveArtifacts "perf_grouped_conv_bwd_data_gfx942.log"
-                            archiveArtifacts "perf_grouped_conv_bwd_weight_gfx942.log"
-                            archiveArtifacts "perf_gemm_bilinear_gfx942.log"
-                            archiveArtifacts "perf_reduction_gfx942.log"
-                            archiveArtifacts "perf_splitK_gemm_gfx942.log"
-                            archiveArtifacts "perf_onnx_gemm_gfx942.log"
-                            archiveArtifacts "perf_mixed_gemm_gfx942.log"
-                            stash includes: "perf_**.log", name: "perf_log_gfx942"
-                        }
-                        else if ( arch == 1 ){
-                            // run standard tests on gfx90a
+                        else if (!params.RUN_FULL_QA && (arch == "gfx90a" || arch == "gfx942")){
+                            // run standard tests on gfx90a or gfx942
                            echo "Run performance tests"
-                            sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx90a"
-                            archiveArtifacts "perf_gemm_gfx90a.log"
-                            archiveArtifacts "perf_onnx_gemm_gfx90a.log"
-                            archiveArtifacts "perf_resnet50_N256_gfx90a.log"
-                            archiveArtifacts "perf_resnet50_N4_gfx90a.log"
-                            stash includes: "perf_**.log", name: "perf_log_gfx90a"
-                        }
-                        else if ( arch == 2 ){
-                            // run standard tests on gfx942
-                            echo "Run performance tests"
-                            sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx942"
-                            archiveArtifacts "perf_gemm_gfx942.log"
-                            archiveArtifacts "perf_onnx_gemm_gfx942.log"
-                            archiveArtifacts "perf_resnet50_N256_gfx942.log"
-                            archiveArtifacts "perf_resnet50_N4_gfx942.log"
-                            stash includes: "perf_**.log", name: "perf_log_gfx942"
+                            sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} ${arch}"
+                            archiveArtifacts "perf_*.log"
+                            stash includes: "perf_**.log", name: "perf_log_${arch}"
                        }
                        // disable performance tests on gfx1030 for now.
-                        //else if ( arch == 3){
+                        //else if ( arch == "gfx10"){
                            // run basic tests on gfx1030
                        //    echo "Run gemm performance tests"
                        //    sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10"
                        //    archiveArtifacts "perf_onnx_gemm_gfx10.log"
                        //    stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10"
                        //}
-                        else if ( arch == 4){
+                        else if ( arch == "gfx11"){
                            // run basic tests on gfx11
                            echo "Run gemm performance tests"
                            sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11"
                            archiveArtifacts "perf_onnx_gemm_gfx11.log"
                            stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11"
                        }
-                        else if ( arch == 5 ){
+                        else if ( arch == "gfx120" ){
                            // run basic tests on gfx12
                            echo "Run gemm performance tests"
                            sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12"
                            archiveArtifacts "perf_onnx_gemm_gfx12.log"
                            stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12"
                        }
-                        else if ( arch == 6 ){
+                        else if ( arch == "gfx908" ){
                            // run basic tests on gfx908
                            echo "Run performance tests"
                            sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908"
                            archiveArtifacts "perf_onnx_gemm_gfx908.log"
                            stash includes: "perf_onnx_gemm_gfx908.log", name: "perf_log_gfx908"
                        }
-                        else if ( arch == 7 ){
+                        else if ( arch == "gfx950" ){
                            // run basic tests on gfx950
                            echo "Run performance tests"
                            sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx950"
@@ -813,7 +693,7 @@ def Build_CK(Map conf=[:]){
                        }
                        }
                    }
-                    if (params.hipTensor_test && arch == 1 ){
+                    if (params.hipTensor_test && arch == "gfx90a" ){
                        // build and test hipTensor on gfx90a node
                        sh """#!/bin/bash
                            rm -rf "${params.hipTensor_branch}".zip
@@ -846,34 +726,18 @@ def Build_CK_and_Reboot(Map conf=[:]){
        echo 'Exception occurred: ' + e.toString()
        throw e
    }
-    finally{
-        if (!conf.get("no_reboot", false)) {
-            reboot()
-        }
-    }
 }

 def process_results(Map conf=[:]){
-    env.HSA_ENABLE_SDMA=0
    checkout scm
    //use older image that has user jenkins
    def image = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm6.3"
-    def prefixpath = "/opt/rocm"

-    // Jenkins is complaining about the render group
-    def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
-    if (conf.get("enforce_xnack_on", false)) {
-        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
-    }
-
-    def variant = env.STAGE_NAME
-    def retimage
-
-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
        try
        {
            echo "Pulling image: ${image}"
-            retimage = docker.image("${image}")
+            def retimage = docker.image("${image}")
            withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) {
                retimage.pull()
            }
@@ -884,7 +748,7 @@ def process_results(Map conf=[:]){
        }
    }

-    withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+    withDockerContainer(image: image, args: '--cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v=/var/jenkins/:/var/jenkins') {
        timeout(time: 15, unit: 'MINUTES'){
            try{
                dir("script"){
@@ -998,19 +862,12 @@ def process_results(Map conf=[:]){

 def run_aiter_tests(Map conf=[:]){
    show_node_info()
-    env.HSA_ENABLE_SDMA=0
    checkout scm
    //use the latest pytorch image
    def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter"
-    def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins"
-    def variant = env.STAGE_NAME
-    def retimage
-    def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
-    def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
-    dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
-    echo "Docker flags: ${dockerOpts}"
+    def dockerOpts=get_docker_options()

-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
        try
        {
            echo "Pulling image: ${image}"
@@ -1057,19 +914,12 @@ def run_aiter_tests(Map conf=[:]){

 def run_pytorch_tests(Map conf=[:]){
    show_node_info()
-    env.HSA_ENABLE_SDMA=0
    checkout scm
    //use the latest pytorch-nightly image
    def image = "${env.CK_DOCKERHUB}:ck_pytorch"
-    def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins"
-    def variant = env.STAGE_NAME
-    def retimage
-    def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
-    def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
-    dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
-    echo "Docker flags: ${dockerOpts}"
+    def dockerOpts=get_docker_options()

-    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
+    gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
        try
        {
            echo "Pulling image: ${image}"
@@ -1343,7 +1193,7 @@ pipeline {
                                --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
                        archiveArtifacts "build/ck_cppcheck.log"
                        cleanWs()
                    }
@@ -1369,7 +1219,7 @@ pipeline {
                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\')"
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
                        cleanWs()
                    }
                }
@@ -1453,7 +1303,7 @@ pipeline {
                                           ./bin/test_grouped_convnd_fwd_large_cases_xdl && ./bin/test_grouped_convnd_bwd_data_xdl_large_cases && ./bin/test_grouped_convnd_fwd_bias_clamp_large_cases"""
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1489,7 +1339,7 @@ pipeline {
                                           ./bin/test_grouped_convnd_fwd_dataset_xdl"""
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1512,11 +1362,11 @@ pipeline {
                    agent{ label rocmnode("gfx90a")}
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \
+                        execute_args = """ cmake -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" ../codegen && \
                                           make -j64 check"""
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1539,13 +1389,10 @@ pipeline {
                    agent{ label rocmnode("gfx90a") }
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
-                                           make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
-                                           cd ../ &&
-                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
+                        execute_args = build_and_run_fmha("gfx90a")
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1558,13 +1405,10 @@ pipeline {
                    agent{ label rocmnode("gfx942") }
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
-                                           make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
-                                           cd ../ &&
-                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
+                        execute_args = build_and_run_fmha("gfx942")
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1577,13 +1421,10 @@ pipeline {
                    agent{ label rocmnode("gfx950") }
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx950 && \
-                                           make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
-                                           cd ../ &&
-                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """
+                        execute_args = build_and_run_fmha("gfx950")
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1596,13 +1437,10 @@ pipeline {
                    agent{ label rocmnode("gfx1201") }
                    environment{
                        setup_args = "NO_CK_BUILD"
-                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx12-generic && \
-                                           make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
-                                           cd ../ &&
-                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx1201 """
+                        execute_args = build_and_run_fmha("gfx1201")
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1626,7 +1464,7 @@ pipeline {
                    environment{
                        setup_args = "NO_CK_BUILD"
                        execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
-                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
+                                            -D CMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
                                            -D CMAKE_BUILD_TYPE=Release \
                                            -D GPU_TARGETS="gfx90a" \
                                            -D GEMM_DATATYPE="fp8;fp16" \
@@ -1634,20 +1472,14 @@ pipeline {
                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
-                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
-                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j64 benchmark_gemm_all && \
-                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json && \
-                                           ninja -j64 benchmark_gemm_preshuffle_all && \
-                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json && \
-                                           ninja -j64 benchmark_gemm_multi_d_all && \
-                                           python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json """
+                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
+                                           ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1661,7 +1493,7 @@ pipeline {
                    environment{
                        setup_args = "NO_CK_BUILD"
                        execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
-                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
+                                            -D CMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
                                            -D CMAKE_BUILD_TYPE=Release \
                                            -D GPU_TARGETS="gfx942" \
                                            -D GEMM_DATATYPE="fp8;fp16" \
@@ -1669,20 +1501,14 @@ pipeline {
                                            -D GEMM_MULTI_D_DATATYPE="fp16" \
                                            -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
                                            -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
-                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" \
-                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j64 benchmark_gemm_all && \
-                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json && \
-                                           ninja -j64 benchmark_gemm_preshuffle_all && \
-                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json && \
-                                           ninja -j64 benchmark_gemm_multi_d_all && \
-                                           python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json """
+                                            -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
+                                           ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
+                                           python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1696,18 +1522,16 @@ pipeline {
                    environment{
                        setup_args = "NO_CK_BUILD"
                        execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
-                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
+                                            -D CMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
                                            -D CMAKE_BUILD_TYPE=Release \
                                            -D GPU_TARGETS="gfx1201" \
                                            -D GEMM_DATATYPE="fp16" \
-                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
-                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
+                                            -D GEMM_LAYOUT="rcr;rrr;crr;ccr" .. && \
                                           ninja -j64 benchmark_gemm_all && \
-                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \
-                                           --warmup 5 --repeat 5 --verbose --json results.json """
+                                           python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                        cleanWs()
                    }
                }
@@ -1730,15 +1554,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx90a") }
                    environment{
-                        def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_rhel8_rocm6.3"
-                        setup_args = """ -DGPU_TARGETS="gfx942" \
-                                         -DCMAKE_CXX_FLAGS=" -O3 " \
-                                         -DCK_CXX_STANDARD="17" \
-                                         -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
+                        setup_args = """ -DGPU_TARGETS="gfx942" -DCK_CXX_STANDARD="17" -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
                        execute_args = " "
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", build_type: 'Release', docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_rhel8_rocm6.3")
                        cleanWs()
                    }
                }
@@ -1750,14 +1570,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx90a") }
                    environment{
-                        def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_sles15_rocm6.3"
-                        setup_args = """ -DGPU_TARGETS="gfx942" \
-                                         -DCMAKE_CXX_FLAGS=" -O3 " \
-                                         -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
+                        setup_args = """ -DGPU_TARGETS="gfx942" -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
                        execute_args = " "
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", build_type: 'Release', docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_sles15_rocm6.3")
                        cleanWs()
                    }
                }
@@ -1769,18 +1586,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx942") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-                                         -DGPU_TARGETS="gfx942" \
-                                         -DCMAKE_CXX_FLAGS=" -O3 " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
-                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx942" \
-                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-                                           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx942" """
+                        execute_args = build_client_examples("gfx942")
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1792,18 +1602,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx950") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-                                         -DGPU_TARGETS="gfx950" \
-                                         -DCMAKE_CXX_FLAGS=" -O3 " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
-                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx950" \
-                                           -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
-                                           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx950" """
+                        execute_args = build_client_examples("gfx950")
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1815,16 +1618,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx908") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908" -DCMAKE_CXX_FLAGS=" -O3 " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
-                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx908" \
-                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-                                           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908" """
+                        execute_args = build_client_examples("gfx908")
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1836,17 +1634,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx90a") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCK_CXX_STANDARD="17" -DCMAKE_CXX_FLAGS=" -O3 " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
-                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx90a" \
-                                           -DCK_CXX_STANDARD="17" \
-                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-                                           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCK_CXX_STANDARD="17" """
+                        execute_args = build_client_examples("gfx90a")
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1859,17 +1651,12 @@ pipeline {
                    agent{ label rocmnode("gfx942") }
                    steps{
                        script {
-                            def execute_args = params.NINJA_FTIME_TRACE ?
-                                """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
-                                    -D CMAKE_CXX_COMPILER="${build_compiler()}" \
-                                    -D CMAKE_BUILD_TYPE=Release \
-                                    -D CMAKE_CXX_FLAGS=" -O3 -ftime-trace" .. && ninja -j64 """ :
-                                """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
-                                    -D CMAKE_CXX_COMPILER="${build_compiler()}" \
-                                    -D CMAKE_BUILD_TYPE=Release \
-                                    -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
+                            def execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
+                                                -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
+                                                -DCMAKE_HIP_COMPILER="${params.BUILD_COMPILER}" \
+                                                -D CMAKE_BUILD_TYPE=Release .. && ninja -j64 """

-                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1")
+                            buildHipClangJobAndReboot(setup_cmd: "",  build_cmd: "", build_type: 'Release', execute_cmd: execute_args)
                        }
                        cleanWs()
                    }
@@ -1882,16 +1669,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx1030") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx10-3-generic" -DCMAKE_CXX_FLAGS=" -O3 " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
-                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx10-3-generic" \
-                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-                                           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx10-3-generic" """
+                        execute_args = build_client_examples("gfx10-3-generic")
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1903,16 +1685,11 @@ pipeline {
                    }
                    agent{ label 'miopen && (gfx1101 || gfx1100)' }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DCMAKE_CXX_FLAGS=" -O3 " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
-                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx11-generic" \
-                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-                                           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" """
+                        execute_args = build_client_examples("gfx11-generic")
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1924,16 +1701,11 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx1201") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx12-generic" -DCMAKE_CXX_FLAGS=" -O3 " """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
-                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx12-generic" \
-                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
-                                           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx12-generic" """
+                        execute_args = build_client_examples("gfx12-generic")
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1942,8 +1714,7 @@ pipeline {
                success {
                    script {
                        // Report the parent stage build ck and run tests status
-                        def variant = env.STAGE_NAME
-                        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
+                        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') {
                            echo "Reporting success status for build ck and run tests"
                        }
                    }
@@ -1970,13 +1741,11 @@ pipeline {
                success {
                    script {
                        // Report the skipped parent's stage status
-                        def parentVariant = "Process Performance Test Results"
-                        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${parentVariant}", account: 'ROCm', repo: 'composable_kernel') {
+                        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Process Performance Test Results", account: 'ROCm', repo: 'composable_kernel') {
                            echo "Process Performance Test Results stage skipped."
                        }
                        // Report the skipped stage's status
-                        def variant = "Process results"
-                        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
+                        gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Process results", account: 'ROCm', repo: 'composable_kernel') {
                            echo "Process Performance Test Results stage skipped."
                        }
                    }
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
@@ -24,6 +24,8 @@
 #include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"
 #include "ck/library/utility/host_tensor.hpp"

+using ::ck::HostTensorDescriptor;
+
 using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
 using ConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu;
 using ConvScale     = ck::tensor_operation::element_wise::ScaleScalePass;
--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp
@@ -15,6 +15,8 @@

 #include "ck/library/tensor_operation_instance/gpu/grouped_gemm_multi_abd_fixed_nk.hpp"

+using ::ck::hip_check_error;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_fastgelu_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_fastgelu_xdl_bf16_i8.cpp
@@ -17,6 +17,8 @@

 #include "ck/host_utility/hip_check_error.hpp"

+using ::ck::hip_check_error;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp
@@ -17,6 +17,8 @@

 #include "ck/host_utility/hip_check_error.hpp"

+using ::ck::hip_check_error;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp
@@ -17,6 +17,8 @@

 #include "ck/host_utility/hip_check_error.hpp"

+using ::ck::hip_check_error;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_xdl_bf16_i8.cpp
+++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_xdl_bf16_i8.cpp
@@ -17,6 +17,8 @@

 #include "ck/host_utility/hip_check_error.hpp"

+using ::ck::hip_check_error;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,3 +1,6 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -105,7 +105,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
    endif()
 endforeach()

-list(APPEND gpu_list_tf32 gfx942)
+list(APPEND gpu_list_tf32 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
--- a/example/01_gemm/common.hpp
+++ b/example/01_gemm/common.hpp
@@ -25,6 +25,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;

 struct ProblemSize final
 {
--- a/example/01_gemm/gemm_wmma_fp8_v3.cpp
+++ b/example/01_gemm/gemm_wmma_fp8_v3.cpp
@@ -13,7 +13,7 @@ using CDataType        = ck::bhalf_t;
 using ComputeTypeA     = ck::f8_t;
 using ComputeTypeB     = ck::f8_t;

-using ALayout = Row;
+using ALayout = Col;
 using BLayout = Col;
 using CLayout = Row;

@@ -30,13 +30,13 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf
    PassThrough, PassThrough, PassThrough, GemmDefault,
    128,
    128, 64, 64,
-    8, 8,
+    16, 16, // AK1, BK1
    16, 16,
    4, 2,
+    S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
+    1, 4, 16, 0,
    S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
-    2, 8, 8, 0,
-    S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
-    2, 8, 8, 0,
+    2, 16, 16, 0,
    1, 1, S<1, 32, 1, 4>, 8,
    ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1,
    ComputeTypeA, ComputeTypeB>;
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/check_err.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 struct AlphaBetaAdd
 {
    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/check_err.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 struct AlphaBetaAdd
 {
    AlphaBetaAdd(int alpha, int beta) : alpha_(alpha), beta_(beta){};
--- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -18,6 +18,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 struct AlphaBetaAdd
 {
    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
--- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/04_gemm_add_add_fastgelu/common.hpp
+++ b/example/04_gemm_add_add_fastgelu/common.hpp
@@ -23,6 +23,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -21,7 +21,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
    endif()
 endforeach()

-list(APPEND gpu_list_tf32 gfx942)
+list(APPEND gpu_list_tf32 gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
@@ -77,7 +81,7 @@ inline __host__ __device__ constexpr double get_atol()
 {
    if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
    {
-        return 1e-2;
+        return 1e-3;
    }
    else if constexpr(std::is_same_v<DataType, float>)
    {
--- a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
@@ -26,6 +26,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using BF16 = ck::bhalf_t;
 using FP16 = ck::half_t;
 using FP32 = float;
--- a/example/12_reduce/reduce_blockwise_two_call.cpp
+++ b/example/12_reduce/reduce_blockwise_two_call.cpp
@@ -20,6 +20,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using namespace ck;
 using namespace ck::tensor_operation::device;

--- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <typename InDataType,
          typename OutDataType,
          typename ComputeDataType,
--- a/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp
@@ -20,6 +20,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/CMakeLists.txt
+++ b/example/15_grouped_gemm/CMakeLists.txt
@@ -33,3 +33,13 @@ if(USE_BITINT_EXTENSION_INT4)
    add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp)
    add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4)
 endif()
+
+list(APPEND gpu_list_tf32 gfx942 gfx950)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+    if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0)
+        add_example_executable(example_grouped_gemm_xdl_fp32_tf32 grouped_gemm_xdl_fp32_tf32.cpp)
+        add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32_tf32)
+        set(target 1)
+    endif()
+endforeach()
--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp
@@ -25,6 +25,11 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/utility/sequence.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp
@@ -23,6 +23,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp
@@ -23,6 +23,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp
@@ -20,6 +20,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
@@ -20,6 +20,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
@@ -20,6 +20,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+#define EXAMPLE_WITH_COMPUTE_DATATYPE
+
+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F32;
+using BDataType        = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = F32;
+using ComputeDataType  = ck::tf32_t;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
+    // clang-format off
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   16,   16,    8,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4, ck::LoopScheduler::Default, ComputeDataType>;
+// clang-format on
+
+#include "run_grouped_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
+
+#undef EXAMPLE_WITH_COMPUTE_DATATYPE
--- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
+++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp
@@ -19,6 +19,11 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/15_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc
@@ -3,6 +3,11 @@

 #pragma once

+// use macro to minimize code change
+#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
+using ComputeDataType = AccDataType;
+#endif
+
 struct ProblemSize final
 {
    std::vector<ck::index_t> Ms;
@@ -231,7 +236,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
                                                                                AccDataType,
                                                                                AElementOp,
                                                                                BElementOp,
-                                                                                CDEElementOp>;
+                                                                                CDEElementOp,
+                                                                                ComputeDataType>;

        for(std::size_t i = 0; i < gemm_descs.size(); i++)
        {
@@ -253,7 +259,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
            pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]);

 #else
-            pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
+            pass &= ck::utils::check_err<decltype(c_device_tensors[i]),
+                                         decltype(c_host_tensors[i]),
+                                         ComputeDataType>(c_device_tensors[i], c_host_tensors[i]);
 #endif
        }
    }
--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp
@@ -8,6 +8,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = INT8;
 using BDataType         = INT8;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = BF16;
 using BDataType         = BF16;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F16;
 using BDataType         = F16;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F32;
 using BDataType         = F32;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using ADataType         = INT4;
 using ADataKernelType   = INT8;
 using BDataType         = INT4;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using ADataType         = INT8;
 using BDataType         = INT8;
 using GemmAccDataType   = INT32;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = BF16;
 using BDataType         = BF16;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F16;
 using BDataType         = F16;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp
@@ -7,6 +7,10 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // DataType
 using ADataType         = F32;
 using BDataType         = F32;
--- a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
+++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp
@@ -18,6 +18,10 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -18,6 +18,10 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 void print_helper_msg()
 {
    std::cout << "arg1: verification (0=no, 1=yes)\n"
--- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
@@ -14,6 +14,10 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using F16 = ck::half_t;
 using F32 = float;

--- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
@@ -14,6 +14,10 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using F16 = ck::half_t;
 using F32 = float;

--- a/example/19_binary_elementwise/elementwise_add_1d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_1d.cpp
@@ -12,6 +12,10 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using F16 = ck::half_t;
 using F32 = float;

--- a/example/19_binary_elementwise/elementwise_add_4d.cpp
+++ b/example/19_binary_elementwise/elementwise_add_4d.cpp
@@ -14,6 +14,10 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using F16 = ck::half_t;
 using F32 = float;

--- a/example/20_grouped_conv_bwd_weight/common.hpp
+++ b/example/20_grouped_conv_bwd_weight/common.hpp
@@ -20,6 +20,10 @@
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -18,6 +18,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 // This example demonstrate a single kernel that runs GEMM layer and laynorm in one fused kernel
 //
 // The GEMM + Layernorm implementation is a specialized kernel which allows fusing both layers
--- a/example/22_cgemm/cgemm_xdl_common.hpp
+++ b/example/22_cgemm/cgemm_xdl_common.hpp
@@ -14,6 +14,10 @@
 #include "ck/library/utility/literals.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/23_softmax/softmax_blockwise.cpp
+++ b/example/23_softmax/softmax_blockwise.cpp
@@ -18,6 +18,10 @@
 #include "ck/library/utility/host_common_util.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using namespace ck::tensor_operation::device;

 using InDataType  = ck::half_t;
--- a/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp
@@ -19,6 +19,11 @@

 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::make_ParallelTensorFunctor;
+using ::ck::Tensor;
+
 using Row    = ck::tensor_layout::gemm::RowMajor;
 using Bypass = ck::tensor_layout::BypassLayoutVerification;

--- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
+++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp
@@ -17,6 +17,11 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::make_ParallelTensorFunctor;
+using ::ck::Tensor;
+
 using Row    = ck::tensor_layout::gemm::RowMajor;
 using Bypass = ck::tensor_layout::BypassLayoutVerification;

--- a/example/26_contraction/run_contraction_bilinear_example.inc
+++ b/example/26_contraction/run_contraction_bilinear_example.inc
@@ -15,6 +15,10 @@
 #include "ck/library/utility/numeric.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using Row = ck::tensor_layout::gemm::RowMajor;

 int run_contraction_bilinear_example(int argc, char* argv[])
--- a/example/26_contraction/run_contraction_scale_example.inc
+++ b/example/26_contraction/run_contraction_scale_example.inc
@@ -15,6 +15,10 @@
 #include "ck/library/utility/numeric.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using Row = ck::tensor_layout::gemm::RowMajor;

 int run_contraction_scale_example(int argc, char* argv[])
--- a/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp
+++ b/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp
@@ -3,6 +3,10 @@

 #include "common.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using XDataType              = ck::half_t;
 using GammaDataType          = ck::half_t;
 using BetaDataType           = ck::half_t;
--- a/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
+++ b/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
@@ -3,6 +3,10 @@

 #include "common.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using XDataType              = ck::half_t;
 using GammaDataType          = ck::half_t;
 using BetaDataType           = ck::half_t;
--- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp
@@ -18,6 +18,11 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::make_ParallelTensorFunctor;
+using ::ck::Tensor;
+
 using Row    = ck::tensor_layout::gemm::RowMajor;
 using Bypass = ck::tensor_layout::BypassLayoutVerification;

--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
@@ -17,6 +17,11 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::make_ParallelTensorFunctor;
+using ::ck::Tensor;
+
 using Row    = ck::tensor_layout::gemm::RowMajor;
 using Bypass = ck::tensor_layout::BypassLayoutVerification;

--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
+++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp
@@ -17,6 +17,11 @@
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/numeric.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::make_ParallelTensorFunctor;
+using ::ck::Tensor;
+
 using Row    = ck::tensor_layout::gemm::RowMajor;
 using Bypass = ck::tensor_layout::BypassLayoutVerification;

--- a/example/30_grouped_conv_fwd_multiple_d/common.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp
@@ -24,6 +24,11 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using BF16 = ck::bhalf_t;
 using FP16 = ck::half_t;
 using FP32 = float;
--- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
@@ -24,6 +24,11 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

+using ::ck::DeviceMem;
+using ::ck::hip_check_error;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using BF16 = ck::bhalf_t;
 using FP16 = ck::half_t;
 using FP32 = float;
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using BF16 = ck::bhalf_t;
 using F32  = float;

--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using F16 = ck::half_t;
 using F32 = float;

--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 using ADataType        = ck::f8_t;
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp
@@ -19,6 +19,10 @@
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 using ADataType        = int8_t;
--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp
@@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp
@@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp
@@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp
@@ -28,6 +28,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
+++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp
@@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp
@@ -29,6 +29,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
 #include "ck/host_utility/device_prop.hpp"

+using ::ck::DeviceMem;
+using ::ck::HostTensorDescriptor;
+using ::ck::Tensor;
+
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

--- a/Show More
+++ b/Show More