diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4c054b822a..7a21634b7d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -51,6 +51,10 @@ None
 
 None
 
+### Upcoming changes
+
+* Non-grouped convolutions are deprecated. All of their functionality is supported by grouped convolution.
+
 ## Composable Kernel 1.1.0 for ROCm 6.1.0
 
 ### Additions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a3bdaf46b..07d2e166bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,6 +98,12 @@ add_compile_options(-Wno-pass-failed)
 add_compile_options(-Wno-switch-default)
 add_compile_options(-Wno-unique-object-duplication)
 
+# add -Og -gdwarf64 for debug builds
+add_compile_options(
+    "$<$<CONFIG:Debug>:-Og>"
+    "$<$<CONFIG:Debug>:-gdwarf64>"
+)
+
 # Recent change in compiler makes this warning ON by default, which led to compile errors.
 add_compile_options(-Wno-nrvo)
 
diff --git a/Jenkinsfile b/Jenkinsfile
index e1273db829..35bb91ccaf 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -438,6 +438,34 @@ def cmake_build(Map conf=[:]){
             echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
         }
     }
+    if (params.RUN_CK_TILE_TRANSPOSE_TESTS){
+        try{
+            archiveArtifacts "perf_transpose_*.log"
+            if (arch_type == 1){
+                stash includes: "perf_transpose_**_gfx90a.log", name: "perf_transpose_log_gfx90a"
+            }
+            else if (arch_type == 2){
+                stash includes: "perf_transpose_**_gfx942.log", name: "perf_transpose_log_gfx942"
+            }
+        }
+        catch(Exception err){
+            echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
+        }
+    }
+    if (params.RUN_CK_TILE_GEMM_TESTS){
+        try{
+            archiveArtifacts "perf_tile_gemm_**.log"
+            if (arch == 1){
+                stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a"
+            }
+            else if (arch == 2){
+                stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942"
+            }
+        }
+        catch(Exception err){
+            echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
+        }
+    }
 }
 
 def buildHipClangJob(Map conf=[:]){
@@ -734,6 +762,24 @@ def process_results(Map conf=[:]){
                             echo "could not locate the FMHA performance logs: ${err.getMessage()}."
                         }
                     }
+                    if (params.RUN_CK_TILE_TRANSPOSE_TESTS){
+                        try{
+                            unstash "perf_transpose_log_gfx942"
+                            unstash "perf_transpose_log_gfx90a"
+                        }
+                        catch(Exception err){
+                            echo "could not locate the Transpose performance logs: ${err.getMessage()}."
+                        }
+                    }
+                    if (params.RUN_CK_TILE_GEMM_TESTS){
+                        try{
+                            unstash "perf_tile_gemm_log_gfx942"
+                            unstash "perf_tile_gemm_log_gfx90a"
+                        }
+                        catch(Exception err){
+                            echo "could not locate the GEMM performance logs: ${err.getMessage()}."
+                        }
+                    }
                     if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){
                         // unstash deb packages
                         unstash "packages"
@@ -815,7 +861,7 @@ def run_aiter_tests(Map conf=[:]){
 }
 
 //launch develop branch daily jobs
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                               0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                               0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
                                               0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -895,6 +941,14 @@ pipeline {
             name: "RUN_CK_TILE_FMHA_TESTS",
             defaultValue: false,
             description: "Run the ck_tile FMHA tests (default: OFF)")
+        booleanParam(
+            name: "RUN_CK_TILE_TRANSPOSE_TESTS",
+            defaultValue: false,
+            description: "Run the ck_tile Transpose tests (default: OFF)")
+        booleanParam(
+            name: "RUN_CK_TILE_GEMM_TESTS",
+            defaultValue: false,
+            description: "Run the ck_tile GEMM tests (default: OFF)")
         booleanParam(
             name: "RUN_TILE_ENGINE_GEMM_TESTS",
             defaultValue: false,
@@ -1144,6 +1198,94 @@ pipeline {
                 }
             }
         }
+        stage("Run CK_TILE_TRANSPOSE Tests")
+        {
+            parallel
+            {
+                stage("Run CK_TILE_TRANSPOSE Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx90a") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
+                                           make -j64 tile_example_batched_transpose && \
+                                           cd ../ &&
+                                           example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+                stage("Run CK_TILE_TRANSPOSE Tests on gfx942")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx942") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
+                                           make -j64 tile_example_batched_transpose && \
+                                           cd ../ &&
+                                           example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+            }
+        }
+        stage("Run CK_TILE_GEMM Tests")
+        {
+            parallel
+            {
+                stage("Run CK_TILE_GEMM Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx90a") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
+                                           make -j64 tile_example_gemm_universal && \
+                                           cd ../ &&
+                                           example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+                stage("Run CK_TILE_GEMM Tests on gfx942")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx942") }
+                    environment{
+                        setup_args = "NO_CK_BUILD"
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx942 && \
+                                           make -j64 tile_example_gemm_universal && \
+                                           cd ../ &&
+                                           example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
+                    }
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+                        cleanWs()
+                    }
+                }
+            }
+        }
         stage("Run TILE_ENGINE_GEMM Tests")
         {
             parallel
@@ -1350,7 +1492,7 @@ pipeline {
                                            -DGPU_TARGETS="gfx90a" \
                                            -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
-                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j 32"""
+                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                     }
                     steps{
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index 9f4c43338e..d1e1a51afd 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -31,7 +31,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
             example_compile_options(example_moe_gemm1_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS})
             example_compile_options(example_moe_gemm2_xdl_pk_i4 PRIVATE ${EXAMPLE_COMPILE_OPTIONS})
         endif()
-        set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
+        set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1")
         example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
         example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
         example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
@@ -39,22 +39,22 @@ foreach(gpu IN LISTS GPU_TARGETS)
     endif()
 endforeach()
 
-set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
+set(GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1")
 set(BLOCKSCALE_GEMM_OPTIONS )
 check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP)
 check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" HAS_MISCHED_PRERA_DIRECTION)
 
 if(hip_VERSION_FLAT LESS 600443483 OR hip_VERSION_FLAT GREATER_EQUAL 700000000)
   if(HAS_MISCHED_BOTTOMUP)
-     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1")
+     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-bottomup=1")
   elseif(HAS_MISCHED_PRERA_DIRECTION)
-     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup")
+     list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm --misched-prera-direction=bottomup")
   endif()
 else()
   if(HAS_MISCHED_BOTTOMUP)
-    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1")
+    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-bottomup=1")
   elseif(HAS_MISCHED_PRERA_DIRECTION)
-    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-prera-direction=bottomup")
+    list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --misched-prera-direction=bottomup")
   endif()
 endif()
 
@@ -62,7 +62,6 @@ check_cxx_compiler_flag("-mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupa
 if(HAS_MAX_OCCUPANCY_EXPERIMENTAL)
     list(APPEND BLOCKSCALE_GEMM_OPTIONS -mllvm --amdgpu-sched-strategy=gcn-iterative-max-occupancy-experimental)
 endif()
-# list(APPEND BLOCKSCALE_GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm --misched-bottomup=1")
 example_compile_options(example_gemm_multiply_multiply_xdl_fp8_bpreshuffle PRIVATE ${GEMM_OPTIONS})
 example_compile_options(example_moe_gemm1_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
 example_compile_options(example_moe_gemm2_xdl_fp8 PRIVATE ${GEMM_OPTIONS})
diff --git a/example/67_gemm_microscaling/CMakeLists.txt b/example/67_gemm_microscaling/CMakeLists.txt
index 14b648c9f8..6ee43aac62 100644
--- a/example/67_gemm_microscaling/CMakeLists.txt
+++ b/example/67_gemm_microscaling/CMakeLists.txt
@@ -58,7 +58,7 @@ example_compile_options(example_moe_gemm1_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_M
 example_compile_options(example_moe_gemm2_xdl_mx_fp4_bpreshuffle PRIVATE ${FP4_MXGEMM_OPTIONS})
 
 set(FP8_MXGEMM_OPTIONS)
-list(APPEND FP8_MXGEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32")
+list(APPEND FP8_MXGEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1")
 example_compile_options(example_gemm_mx_fp8 PRIVATE ${FP8_MXGEMM_OPTIONS})
 example_compile_options(example_gemm_mx_bf8 PRIVATE ${FP8_MXGEMM_OPTIONS})
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
index 77b63a0c83..47cf6b3ad4 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation
 
 import copy
@@ -8,21 +8,13 @@ import fnmatch
 import itertools
 from pathlib import Path
 from typing import List, Optional, Tuple, Dict, Literal
+from collections import defaultdict
 
 from codegen.cmake_config import *
 from codegen.cpp_symbol_map import *
+from codegen.utils import update_file
 
 
-BWD_DQDKDV_PIPELINE_MAP = {
-    "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP",
-    "kr_ktr_vr"      : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR",
-}
-
-BWD_DQDKDV_PIPELINE_ENUM_MAP = {
-    "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP",
-    "kr_ktr_vr"      : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR",
-}
-
 FMHA_BWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
 // auto generated by generate.py
@@ -56,8 +48,8 @@ using fmha_bwd_shape_{F_idx} = ck_tile::TileFmhaBwdShape<fmha_block_tile_{F_idx}
                                                          fmha_block_warps2_{F_idx},
                                                          fmha_warp_tile0_{F_idx}>;
 
-using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
-                                                       {F_skpad},
+using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<false,  /* kPadSeqLenQ */
+                                                       false,  /* kPadSeqLenK */
                                                        {F_dpad},
                                                        {F_dvpad},
                                                        false,
@@ -93,18 +85,18 @@ using fmha_bwd_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdPipelineProblem<
     fmha_dropout_{F_idx},
     fmha_bwd_trait_{F_idx}>;
 
-using fmha_bwd_pipeline_{F_idx} = {F_pipeline}<fmha_bwd_pipeline_problem_{F_idx}>;
+using fmha_bwd_pipeline_{F_idx} = ck_tile::BlockFmhaBwdDQDKDVPipeline<fmha_bwd_pipeline_problem_{F_idx}>;
 
 using fmha_bwd_dk_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
     ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                       typename FmhaBwdTypeConfig<{F_dtype}>::KGradDataType,
-                                      {F_skpad},
+                                      false,
                                       {F_dpad}>>;
 
 using fmha_bwd_dv_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
     ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                       typename FmhaBwdTypeConfig<{F_dtype}>::VGradDataType,
-                                      {F_skpad},
+                                      false,
                                       {F_dvpad}>>;
 
 using fmha_bwd_dq_dk_dv_kernel_{F_idx} =
@@ -115,13 +107,10 @@ using fmha_bwd_dq_dk_dv_kernel_{F_idx} =
 using dq_dk_dv_trait_{F_idx} = fmha_bwd_dq_dk_dv_traits_<{F_hdim},
                                                          {F_dtype},
                                                          {F_mode},
-                                                         {F_pipeline_enum},
                                                          fmha_mask_{F_idx},
                                                          fmha_dropout_{F_idx},
                                                          {F_bias},
                                                          {F_dbias},
-                                                         {F_spad},
-                                                         {F_skpad},
                                                          {F_dpad},
                                                          {F_dvpad},
                                                          {F_deterministic}>;
@@ -195,15 +184,18 @@ FMHA_BWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
 """
 
 FMHA_BWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) &&
-                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{
-                using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dvpad}>;
-                using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_pipeline_enum}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_spad0}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_deterministic}>;
-                using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dpad}, {F_deterministic}>;
+                        ({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{
+                using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dvpad}>;
+                using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}>;
+                using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dpad}, {F_deterministic}>;
                 r = fmha_bwd_<dot_do_o_trait_, dq_dk_dv_trait_, convert_dq_trait_>(s, a);
                 return r;
             }}
 """
 
+# M0 size for 1d kernels (dot/convert)
+M0_1D = 64
+
 # GEMM0: Q@K=S^T
 # GEMM1: P^T@dO^T=dV(This was chosen as G1 to match fwd, but N1 must be equal to headdim_v)
 # GEMM2: dO@V=dP^T(This was chosen as G2 because of the calculation order)
@@ -249,8 +241,6 @@ class FmhaBwdDQDKDVKernel:
     F_hdim          : int  # hdim
     F_dtype         : str  # data type
     F_tile          : FmhaBwdDQDKDVTileSize
-    F_spad          : str  # true/false
-    F_skpad         : str  #
     F_dpad          : str  #
     F_dvpad         : str  #
     F_bias          : str  #
@@ -259,7 +249,6 @@ class FmhaBwdDQDKDVKernel:
     F_mask          : str  # value from MASK_MAP
     F_mode          : str  # value from MODE_MAP
     F_deterministic : str  #
-    F_pipeline      : str  #
     mask_impl       : str  #
 
     @property
@@ -293,8 +282,6 @@ class FmhaBwdDQDKDVKernel:
                 F_wm1           = self.F_tile.F_wm1,
                 F_wn1           = self.F_tile.F_wn1,
                 F_wk1           = self.F_tile.F_wk1,
-                F_spad          = BOOL_MAP[self.F_spad],
-                F_skpad         = BOOL_MAP[self.F_skpad],
                 F_dpad          = BOOL_MAP[self.F_dpad],
                 F_dvpad         = BOOL_MAP[self.F_dvpad],
                 F_bias          = BIAS_MAP[self.F_bias],
@@ -304,21 +291,18 @@ class FmhaBwdDQDKDVKernel:
                 F_mask          = get_mask_map(self.mask_impl)[self.F_mask],
                 F_mode          = MODE_MAP[self.F_mode],
                 F_deterministic = BOOL_MAP[self.F_deterministic],
-                F_pipeline_enum = BWD_DQDKDV_PIPELINE_ENUM_MAP[self.F_pipeline],
-                F_pipeline      = BWD_DQDKDV_PIPELINE_MAP[self.F_pipeline])
+            )
 
     @property
     def name(self) -> str:
         def pad_name() -> str:
             n = ''
-            if self.F_spad == 't': n += 's'
-            if self.F_skpad == 't' : n += 'sk'
             if self.F_dpad == 't' : n += 'd'
             if self.F_dvpad == 't' : n += 'dv'
             if n != '' : n = 'p' + n
             return n
         pn = pad_name()
-        n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name + f'_{self.F_pipeline}'
+        n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name
         if pn != '' : n += f'_{pn}'
         else: n += '_npad'
 
@@ -347,20 +331,15 @@ class FmhaBwdDQDKDVKernel:
         return self.name + ".cpp"
 
 # TODO: design a more practical way to do it
-# this is current supported tile size & pipeline.
+# this is current supported tile size.
 def get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype : str) -> Optional[dict]:
     if dtype == 'fp16' or dtype == 'bf16':
         return {
-            '32'  : [FmhaBwdDQDKDVTileSize( 32, 128,  32, 32,  32, 32, 64,  32,  32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '64'  : [FmhaBwdDQDKDVTileSize( 32, 128,  64, 32,  64, 32, 32,  64,  64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '128' : [FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            # '160' : [FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
-            #             "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '256' : [FmhaBwdDQDKDVTileSize( 16,  64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"]
+            '32'  : FmhaBwdDQDKDVTileSize( 32, 128,  32, 32,  32, 32, 64,  32,  32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
+            '64'  : FmhaBwdDQDKDVTileSize( 32, 128,  64, 32,  64, 32, 32,  64,  64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+            '128' : FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+            # '160' : FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
+            '256' : FmhaBwdDQDKDVTileSize( 16,  64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
         }
     else:
         return None
@@ -375,7 +354,7 @@ using fmha_bwd_dot_do_o_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdOGradDot
     typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
     typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::OGradDataType,
     typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::DDataType,
-    /* BlockSize = */ 64,
+    /* BlockSize = M0 = */ 64,
     {F_hdim},
     {F_mode},
     fmha_bwd_dot_do_o_trait_{F_idx}>;
@@ -580,7 +559,6 @@ class FmhaBwdConvertQGradKernel:
 @dataclass(frozen=True)
 class FmhaBwdApiTrait:
     idx           : int  # this is not a tunable, but a counter to differentiate symbol
-    pipeline      : str
     # sync with fmha_bwd_traits<>, to generate fallback calls
     hdim          : int
     dtype         : str  # data type
@@ -590,9 +568,7 @@ class FmhaBwdApiTrait:
     bias          : str
     dbias         : str
     dropout       : str
-    spad          : str
-    spad1         : str # spad for dot/convert kernel
-    skpad         : str
+    spad1d        : str # spad for 1d kernels (dot/convert)
     dpad          : str
     dvpad         : str
     deterministic : str
@@ -611,24 +587,14 @@ class FmhaBwdApiTrait:
     def bhdv(self) -> int:
         return self.tile.F_bhdv
 
-    def scheck(self, spad1 : str) -> str:
-        if self.mode == 'group':
-            return 'true' # always support
-        elif self.spad == 't' and spad1 == 't':
-            return f'a.seqlen_q % {self.bm0} != 0'
-        elif self.spad == 'f' and spad1 == 't':
-            return f'a.seqlen_q % {self.bm0} == 0 and a.seqlen_q % 64 != 0'
-        else: # self.skpad == 'f' and skpad1 == 'f'
-            return 'a.seqlen_q % 64 == 0'
-
     @property
-    def skcheck(self) -> str:
+    def scheck(self) -> str:
         if self.mode == 'group':
             return 'true' # always support
-        elif self.skpad == 't':
-            return f'a.seqlen_k % {self.bn0} != 0'
-        else:
-            return f'a.seqlen_k % {self.bn0} == 0'
+        elif self.spad1d == 't':
+            return f'a.seqlen_q % {M0_1D} != 0'
+        else: # self.spad1d == 'f'
+            return f'a.seqlen_q % {M0_1D} == 0'
 
     @property
     def dcheck(self) -> str:
@@ -647,14 +613,14 @@ class FmhaBwdApiTrait:
         def get_occupancy(dtype, hdim):
             return 2
 
-        return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1,
+        return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1d,
             F_dvpad=self.dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim))
 
     @property
     def dq_dk_dv_kernel(self) -> FmhaBwdDQDKDVKernel:
         return FmhaBwdDQDKDVKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_tile=self.tile,
-            F_spad=self.spad, F_skpad=self.skpad, F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias,
-            F_dbias=self.dbias, F_dropout=self.dropout, F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, F_pipeline=self.pipeline, mask_impl=self.mask_impl)
+            F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias, F_dbias=self.dbias, F_dropout=self.dropout,
+            F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, mask_impl=self.mask_impl)
 
     @property
     def convert_dq_kernel(self) -> FmhaBwdConvertQGradKernel:
@@ -664,48 +630,46 @@ class FmhaBwdApiTrait:
             return 2
 
         return FmhaBwdConvertQGradKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype,
-            F_bm0=64, F_bn0=self.tile.F_bn0, F_spad=self.spad, F_dpad=self.dpad,
+            F_bm0=M0_1D, F_bn0=self.tile.F_bn0, F_spad=self.spad1d, F_dpad=self.dpad,
             F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim),
             F_deterministic=self.deterministic)
 
 class FmhaBwdApiPool:
     def __init__(self, mask_impl):
-        self.dq_dk_dv_pool = dict()
+        self.dq_dk_dv_pool = defaultdict(lambda: defaultdict(list))
         self.mask_impl = mask_impl
 
     def register_dq_dk_dv_traits(self, trait : FmhaBwdApiTrait) -> None:
         # TODO: do we need to check duplication?
-        if trait.dtype not in self.dq_dk_dv_pool.keys():
-            self.dq_dk_dv_pool[trait.dtype] = dict()
-        if trait.hdim not in self.dq_dk_dv_pool[trait.dtype].keys():
-            self.dq_dk_dv_pool[trait.dtype][trait.hdim] = list()
-
         self.dq_dk_dv_pool[trait.dtype][trait.hdim].append(copy.copy(trait))
 
+    @staticmethod
+    def if_(i: int) -> str:
+        return 'if' if i == 0 else 'else if'
+
+    def _api_innders(self, traits: List[FmhaBwdApiTrait]) -> str:
+        inners = ""
+        i = 0 
+        for trait in traits:
+            inners += FMHA_BWD_API_INNER_DISPATCH.format(F_if=self.if_(i), F_mode=MODE_MAP[trait.mode],
+                F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
+                F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
+                F_scheck=trait.scheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=trait.hdim, F_dtype=BWD_DTYPE_MAP[trait.dtype],
+                F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                F_deterministic=BOOL_MAP[trait.deterministic])
+            i += 1
+        return inners
+
     @property
     def api(self) -> str:
         per_dtypes=str()
-        for i, dtype in enumerate(self.dq_dk_dv_pool.keys()):
+        for i, dtype in enumerate(self.dq_dk_dv_pool):
             per_hdim_case=str()
-            for j, hdim in enumerate(self.dq_dk_dv_pool[dtype].keys()):
+            for j, hdim in enumerate(self.dq_dk_dv_pool[dtype]):
                 traits=self.dq_dk_dv_pool[dtype][hdim]
-                inners=str()
-                for k, trait in enumerate(traits):
-                    if_k = 'if' if k == 0 else 'else if'
-                    for spad1 in ["t", "f"]:
-                        if (spad1 == "f" and (trait.spad == "t" or trait.mode == "group")):
-                            continue
-                        inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline],
-                                    F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
-                                    F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
-                                    F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype],
-                                    F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                    F_deterministic=BOOL_MAP[trait.deterministic])
-
-                if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
-            if_i = 'if' if i == 0 else 'else if'
-            per_dtypes = per_dtypes + FMHA_BWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
+                inners = self._api_innders(traits)
+                per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=self.if_(j), F_hdim=hdim, F_inner_dispatch=inners)
+            per_dtypes += FMHA_BWD_API_PER_DTYPE.format(F_if=self.if_(i), F_dtype=dtype, F_hdim_case=per_hdim_case)
         if not per_dtypes:
             # empty string we add some ignore to suppress warning in api
             per_dtypes += '    (void)t ; (void)s ; (void)a;'
@@ -730,21 +694,16 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
         d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
         if d is None:
             continue
-        for hdim_str, mode, mask, bias, dbias, dropout, spad, spad1, skpad, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 6)):
-            tile = d[hdim_str][0]
-            ppl = d[hdim_str][1]
+        for hdim_str, mode, mask, bias, dbias, dropout, spad1d, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 4)):
+            tile = d[hdim_str]
             hdim = int(hdim_str)
-            if (mode == "group") and (spad == "f" or skpad == "f"):
-                continue
-            if (spad1 == "f") and (spad == "t" or mode == "group"):
+            if (mode == "group") and (spad1d == "f"):
                 continue
             if ((bias == "no" or bias == "alibi") and dbias == "t"):
                 continue
             if ("wg32" in dropout):
                 continue
-            if (dpad == "t" or dvpad == "t"):
-                ppl = d[hdim_str][2]
-            t = FmhaBwdApiTrait(idx=0, pipeline=ppl, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad=spad, spad1=spad1, skpad=skpad, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl)
+            t = FmhaBwdApiTrait(idx=0, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad1d=spad1d, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl)
 
             if not fnmatch.fnmatch(t.dot_do_o_kernel.name, filter_dot_do_o):
                 continue
@@ -808,13 +767,13 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
 
 def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
     api_pool, kernels_dot_do_o,  kernels_dq_dk_dv,  kernels_convert_dq = get_bwd_blobs(filter_list, receipt, mask_impl, optdim_list)
-    (output_dir / FMHA_BWD_API_FILENAME).write_text(api_pool.api)
+    update_file(output_dir / FMHA_BWD_API_FILENAME, api_pool.api)
     for k in kernels_dot_do_o:
-        (output_dir / k.filename).write_text(k.template)
+        update_file(output_dir / k.filename, k.template)
     for k in kernels_convert_dq:
-        (output_dir / k.filename).write_text(k.template)
+        update_file(output_dir / k.filename, k.template)
     for k in kernels_dq_dk_dv:
-        (output_dir / k.filename).write_text(k.template)
+        update_file(output_dir / k.filename, k.template)
 
 
 def list_blobs(file_path: Path, filter_list: str, receipt, optdim_list, mask_impl) -> None:
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index 730641a6b0..269af4e6a7 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -533,20 +533,31 @@ class KernelComponentFactory:
         pipelines = []
         if dtype in ['fp16', 'bf16']:
             for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
-                if bias == "bias":
-                    # TODO: rocm 6.2 compiler problem if using qr_async for bias case
+                if hdim == 256 and hdim_v == 256:
+                # if True:
                     pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
                     pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
+                    # the below two is used for hdim vectorize load
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
+                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
+
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
                     pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
                 else:
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
-                if receipt == 1 and bias != "bias":
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim
-                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim
+                    if bias == "bias":
+                        # TODO: rocm 6.2 compiler problem if using qr_async for bias case
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                    else:
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip))
+                    if receipt == 1 and bias != "bias":
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim
+                        pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip)) # TODO: cover arbitraty hdim
         elif dtype in ['fp8', 'bf8']:
             # no need lse/dropout kernels
             for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
@@ -584,7 +595,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
                     if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                         # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                         continue
-                if (hdim, hdim_v) == (192, 128) or hdim == 160:
+                if (hdim, hdim_v) == (192, 128):
                     # NOTE: this is used to speedup deepseek prefill case, we don't gen training
                     if pipeline.F_bias != 'no' or pipeline.F_dropout == 't':
                         continue
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 5b35e7f0bd..0e4ac44d45 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -41,7 +41,6 @@ K0_MAX_SUBMAX_MAP = {
 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
     "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
     "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS",
-    "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
 }
 
 FMHA_FWD_SPLITKV_KERNEL_BODY="""
@@ -685,28 +684,17 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, opt
         pipelines = []
         if dtype in ['fp16', 'bf16']:
             for logits, mask, bias, pagedkv in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
-                # TODO: use async pipeline when compiler is more stable
-                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128, 160]:
-                # if True:
-                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
 
-                    pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
 
-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
 
-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                else:
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
-                    if receipt == 1:
-                        pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
-                        pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', logits, bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
+                pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
         elif dtype in ['fp8', 'bf8']:
             for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
                 pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 't', squant, 'f', mask))
diff --git a/example/ck_tile/01_fmha/codegen/utils.py b/example/ck_tile/01_fmha/codegen/utils.py
new file mode 100644
index 0000000000..e3bbb18c42
--- /dev/null
+++ b/example/ck_tile/01_fmha/codegen/utils.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import os.path as path
+
+
+def update_file(file_path, content):
+    """Update the file at file_path with the given content if it differs from the existing content.
+
+    It avoids unnecessary touching of the file which triggers rebuilds
+    """
+
+    existing_content = ""
+    if path.exists(file_path):
+        with open(file_path, "r") as file:
+            existing_content = file.read()
+    if existing_content == content:
+        return
+    with open(file_path, "w") as file:
+        file.write(content)
diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp
index 9179dbd9be..c999cf750e 100644
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -357,31 +357,25 @@ auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args)
 template <ck_tile::index_t HDim_,
           typename DataType_,
           bool kIsGroupMode_,
-          ck_tile::BlockFmhaBwdPipelineEnum FmhaBwdPipelineEnum_,
           typename FmhaMask_,
           typename FmhaDropout_,
           ck_tile::BlockAttentionBiasEnum BiasEnum_,
           bool kHasBiasGrad_,
-          bool kPadS_,
-          bool kPadSK_,
           bool kPadD_,
           bool kPadDv_,
           bool kIsDeterministic_>
 struct fmha_bwd_dq_dk_dv_traits_
 {
-    static constexpr ck_tile::index_t HDim    = HDim_;
-    using DataType                            = ck_tile::remove_cvref_t<DataType_>;
-    static constexpr bool kIsGroupMode        = kIsGroupMode_;
-    static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_;
-    using FmhaMask                            = ck_tile::remove_cvref_t<FmhaMask_>;
-    using FmhaDropout                         = ck_tile::remove_cvref_t<FmhaDropout_>;
-    static constexpr auto BiasEnum            = BiasEnum_;
-    static constexpr bool kHasBiasGrad        = kHasBiasGrad_;
-    static constexpr bool kPadS               = kPadS_;
-    static constexpr bool kPadSK              = kPadSK_;
-    static constexpr bool kPadD               = kPadD_;
-    static constexpr bool kPadDv              = kPadDv_;
-    static constexpr bool kIsDeterministic    = kIsDeterministic_;
+    static constexpr ck_tile::index_t HDim = HDim_;
+    using DataType                         = ck_tile::remove_cvref_t<DataType_>;
+    static constexpr bool kIsGroupMode     = kIsGroupMode_;
+    using FmhaMask                         = ck_tile::remove_cvref_t<FmhaMask_>;
+    using FmhaDropout                      = ck_tile::remove_cvref_t<FmhaDropout_>;
+    static constexpr auto BiasEnum         = BiasEnum_;
+    static constexpr bool kHasBiasGrad     = kHasBiasGrad_;
+    static constexpr bool kPadD            = kPadD_;
+    static constexpr bool kPadDv           = kPadDv_;
+    static constexpr bool kIsDeterministic = kIsDeterministic_;
 };
 
 template <typename Traits_>
diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt
index e6f67e4c76..b1aede42c7 100644
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -10,7 +10,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion
 list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef)
 list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker)
 list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps)
-list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-32 -mllvm -enable-noalias-to-md-conversion=0")
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0")
 target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp
index 602661f779..cf816caa88 100644
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -1,16 +1,21 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "ck_tile/host.hpp"
-#include "reduce.hpp"
+#include "ck_tile/ops/reduce.hpp"
 #include <cstring>
 
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3328", "m dimension")
-        .insert("n", "4096", "n dimension")
+    arg_parser.insert("n", "32", "n dimension")
+        .insert("h", "7", "h dimension")
+        .insert("w", "7", "w dimension")
+        .insert("c", "512", "c dimension")
         .insert("v", "1", "cpu validation or not")
         .insert("prec", "fp16", "precision")
-        .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("warmup", "0", "cold iter")
+        .insert("repeat", "1", "hot iter");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -23,15 +28,28 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using ComputeDataType = float;
     using YDataType       = DataType;
 
-    ck_tile::index_t m = arg_parser.get_int("m");
-    ck_tile::index_t n = arg_parser.get_int("n");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t H = arg_parser.get_int("h");
+    ck_tile::index_t W = arg_parser.get_int("w");
+    ck_tile::index_t C = arg_parser.get_int("c");
     int do_validation  = arg_parser.get_int("v");
     int warmup         = arg_parser.get_int("warmup");
     int repeat         = arg_parser.get_int("repeat");
 
-    ck_tile::HostTensor<XDataType> x_host({m, n});
-    ck_tile::HostTensor<YDataType> y_host_ref({m});
-    ck_tile::HostTensor<YDataType> y_host_dev({m});
+    std::vector<ck_tile::index_t> problem_shape = {N, H, W, C};
+    std::vector<ck_tile::index_t> strides(4);
+    strides[0] = H * W * C;
+    strides[1] = W * C;
+    strides[2] = C;
+    strides[3] = 1;
+
+    // Define reduction specification:
+    constexpr auto kept_dim    = ck_tile::sequence<0, 3>{}; // Which dimension to keep
+    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce
+
+    ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
+    ck_tile::HostTensor<YDataType> y_host_ref({N, C}, {C, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({N, C}, {C, 1});
 
     ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
 
@@ -54,7 +72,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     constexpr ck_tile::index_t kBlockSize  = 256;
     constexpr ck_tile::index_t kBlockPerCu = 1;
-    ck_tile::index_t kGridSize             = (m / BlockTile::at(ck_tile::number<0>{}));
+    ck_tile::index_t kept_dim_len_prod     = N * C;
+    ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
+                                 BlockTile::at(ck_tile::number<0>{});
     std::cout << "grid size " << kGridSize << std::endl;
 
     using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
@@ -63,6 +83,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     using Kernel = ck_tile::Reduce<Porblem>;
 
+    // Create input tensor shape and strides
+    auto input_shape =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
+
+    if(!Kernel::IsSupportedArgument(
+           C, input_strides)) // output tensor's continuous dimension and input strides
+    {
+        throw std::runtime_error("Wrong! Arguments not supported!\n");
+    }
+
     float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                    ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                        Kernel{},
@@ -71,10 +102,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                        0,
                                        static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
                                        static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
-                                       m,
-                                       n));
+                                       input_shape,
+                                       input_strides,
+                                       kept_dim,
+                                       reduce_dims));
 
-    std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
+    std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;
 
     float gb_per_sec = num_btype / 1.E6 / ave_time;
 
@@ -86,7 +119,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     {
         // reference
         ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
-            x_host, y_host_ref, ReduceOp{});
+            x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims);
         y_buf.FromDevice(y_host_dev.mData.data());
         pass = ck_tile::check_err(y_host_dev, y_host_ref);
 
diff --git a/example/ck_tile/05_reduce/reduce.hpp b/example/ck_tile/05_reduce/reduce.hpp
deleted file mode 100644
index 6fbb0b4274..0000000000
--- a/example/ck_tile/05_reduce/reduce.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
-#include "ck_tile/ops/reduce/block/block_reduce.hpp"
-#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
-
-namespace ck_tile {
-
-template <typename BlockWarps, // num warps along seq<M, N>
-          typename BlockTile,  // block size, seq<M, N>
-          typename WarpTile,   // warp size, seq<M, N>
-          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
-struct Reduce2dShape
-{
-    static constexpr index_t Block_M = BlockTile::at(number<0>{});
-    static constexpr index_t Block_N = BlockTile::at(number<1>{});
-
-    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
-    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
-
-    static constexpr index_t Vector_M = Vector::at(number<0>{});
-    static constexpr index_t Vector_N = Vector::at(number<1>{});
-
-    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
-    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
-
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
-
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
-
-    static constexpr index_t BlockSize =
-        ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
-};
-
-template <typename XDataType_,
-          typename ComputeDataType_,
-          typename YDataType_,
-          typename BlockShape_,
-          typename ReduceOp_>
-struct Reduce2dProblem
-{
-    using XDataType       = remove_cvref_t<XDataType_>;
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
-    using YDataType       = remove_cvref_t<YDataType_>;
-    using BlockShape      = remove_cvref_t<BlockShape_>;
-    using ReduceOp        = ReduceOp_;
-
-    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
-    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
-};
-
-template <typename Problem_, typename Policy_ = BlockReduce2dDefaultPolicy>
-struct Reduce
-{
-    using Problem = ck_tile::remove_cvref_t<Problem_>;
-    using Policy  = ck_tile::remove_cvref_t<Policy_>;
-
-    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
-    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
-    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
-
-#if 0
-    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N)
-    const
-    {
-        using S = typename Problem::BlockShape;
-
-        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
-
-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(M), number<1>{});
-
-        const auto iM = get_block_id() * S::Block_M;
-
-        auto x_window = make_tile_window(x_m_n,
-                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
-                                         {iM, 0},
-                                         Policy::template MakeXBlockTileDistribution<Problem>());
-
-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
-
-        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
-
-        const XDataType reduce_init_value = 0;
-
-        constexpr auto reduce_dims = sequence<1>{};
-
-        auto y_compute = decltype(block_tile_reduce<ComputeDataType>(
-            load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){};
-
-        set_tile(y_compute, reduce_init_value);
-
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x = load_tile(x_window);
-            block_tile_reduce(y_compute, x, reduce_dims, f_reduce);
-            move_tile_window(x_window, {0, S::Block_N});
-        }
-
-        block_tile_reduce_sync(y_compute, f_reduce);
-
-        store_tile(y_window, cast_tile<YDataType>(y_compute));
-    }
-#else
-    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
-    {
-        using S = typename Problem::BlockShape;
-
-        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
-
-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(M), number<1>{});
-
-        const auto iM = get_block_id() * S::Block_M;
-
-        auto x_window = make_tile_window(x_m_n,
-                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
-                                         {iM, 0},
-                                         Policy::template MakeXBlockTileDistribution<Problem>());
-
-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
-
-        __shared__ char smem[Policy::template GetSmemSize<Problem>()];
-
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
-
-        auto reduce_func         = typename Problem::ReduceOp{};
-        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
-        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
-        auto block_reduce2d_cross_warp_sync =
-            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
-
-        using XTensorType = decltype(load_tile(x_window));
-        auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>();
-        set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x = load_tile(x_window);
-            block_reduce2d(x, y_compute, reduce_func);
-            move_tile_window(x_window, {0, S::Block_N});
-        }
-
-        block_reduce2d_sync(y_compute, reduce_func);
-        block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func);
-
-        store_tile(y_window, cast_tile<YDataType>(y_compute));
-    }
-#endif
-};
-
-} // namespace ck_tile
diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
index 79df4e624d..475c13166d 100644
--- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt
+++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
@@ -1,2 +1 @@
 add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)
-add_executable(tile_example_grouped_gemm_tileloop EXCLUDE_FROM_ALL grouped_gemm_tileloop.cpp)
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
index bb0a0d5840..897952f03c 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <hip/hip_runtime.h>
 
@@ -16,19 +16,11 @@
 #include "ck_tile/host.hpp"
 #include "grouped_gemm.hpp"
 
-template <typename ADataType,
-          typename BDataType,
-          typename DsDataType,
-          typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename DsLayout,
-          typename CLayout,
-          typename CDEElementWise = ck_tile::element_wise::PassThrough>
-float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
-                   const ck_tile::stream_config& s,
-                   void* kargs_ptr)
+template <typename ALayout, typename BLayout, typename CLayout>
+float grouped_gemm_tileloop(const ck_tile::stream_config& s,
+                            const ck_tile::index_t num_groups,
+                            void* kargs_ptr,
+                            bool splitk)
 {
 #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
     // Memory friendly for Interwave scheduler
@@ -83,8 +75,6 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
     constexpr bool kPadN = false;
     constexpr bool kPadK = false;
 
-    constexpr bool TransposeC = false;
-
     constexpr int kBlockPerCu                         = 1;
     constexpr ck_tile::index_t TileParitionerGroupNum = 8;
     constexpr ck_tile::index_t TileParitionerM01      = 4;
@@ -97,54 +87,41 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
         GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
 
     using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
-                                                                 kPadN,
-                                                                 kPadK,
-                                                                 DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 CLayout,
-                                                                 TransposeC>;
+    using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits<kPadM,
+                                                                           kPadN,
+                                                                           kPadK,
+                                                                           DoubleSmemBuffer,
+                                                                           ALayout,
+                                                                           BLayout,
+                                                                           CLayout>;
     using GemmPipelineProblem =
         ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
 
-    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
-
-    const ck_tile::index_t k_grain     = gemm_descs[0].k_batch * K_Tile;
-    const ck_tile::index_t K_split     = (gemm_descs[0].K + k_grain - 1) / k_grain * K_Tile;
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-
     float ave_time{0};
 
-    const auto Run = [&](const auto has_hot_loop_,
-                         const auto tail_number_,
-                         const auto memory_operation_) {
-        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-        constexpr auto tail_number_v    = tail_number_.value;
+    const auto Run = [&](const auto memory_operation_) {
         constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
         constexpr auto memory_operation = memory_operation_.value;
 
+        // We create the GEMM pipeline without specifying hotloop or tailnumber.
+        // These are automatically run inside the kernel based on the given input data.
         using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
                                                                            BDataType,
                                                                            AccDataType,
                                                                            GemmShape,
                                                                            GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
+                                                                           scheduler>;
 
         using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
         using GemmEpilogue = ck_tile::CShuffleEpilogue<
             ck_tile::CShuffleEpilogueProblem<ADataType,
                                              BDataType,
-                                             DsDataType,
+                                             ck_tile::tuple<>,
                                              AccDataType,
                                              CDataType,
-                                             DsLayout,
+                                             ck_tile::tuple<>,
                                              CLayout,
-                                             CDEElementWise,
+                                             ck_tile::element_wise::PassThrough,
                                              GemmPipelineProblem::kBlockSize,
                                              TilePartitioner::MPerBlock,
                                              TilePartitioner::NPerBlock,
@@ -156,20 +133,8 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                                              UniversalGemmProblem::TransposeC,
                                              memory_operation>>;
         using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKargs(gemm_descs);
-        if(!Kernel::IsSupportedArgument(kargs))
-        {
-            throw std::runtime_error("Kernel arguments not supported!");
-        }
-
         constexpr dim3 blocks = Kernel::BlockSize();
-        const dim3 grids      = Kernel::GridSize(gemm_descs);
-
-        HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
-                                            kargs.data(),
-                                            get_workspace_size(gemm_descs),
-                                            hipMemcpyHostToDevice,
-                                            s.stream_id_));
+        const dim3 grids      = Kernel::MaxOccupancyGridSize(s);
 
         if(s.log_level_ > 0)
         {
@@ -186,45 +151,26 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                                        blocks,
                                        0,
                                        ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
-                                       gemm_descs.size()));
+                                       num_groups));
 
         return ave_time;
     };
 
-    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-        if(gemm_descs[0].k_batch == 1)
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::set>{});
-        }
-        else
-        {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::atomic_add>{});
-        }
-    };
-
-    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    if(!splitk)
+    {
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::set>{});
+    }
+    else
+    {
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::atomic_add>{});
+    }
 
     return ave_time;
 }
 
 #include "run_grouped_gemm_example.inc"
 
-constexpr bool Persistent = false;
-int main(int argc, char* argv[])
-{
-    try
-    {
-        return !run_grouped_gemm_example<Persistent>(argc, argv);
-    }
-    catch(const std::runtime_error& e)
-    {
-        std::cerr << "Runtime error: " << e.what() << '\n';
-        return EXIT_FAILURE;
-    }
-}
+constexpr bool Persistent = true;
+int main(int argc, char* argv[]) { return !run_grouped_gemm_example<Persistent>(argc, argv); }
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
index 74efb1bdeb..89d91fbef6 100644
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -15,7 +15,7 @@
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
 
 #ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V4
 #endif
 
 #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp
deleted file mode 100644
index 897952f03c..0000000000
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <tuple>
-#include <memory>
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/epilogue.hpp"
-#include "ck_tile/ops/gemm.hpp"
-#include "ck_tile/host.hpp"
-#include "grouped_gemm.hpp"
-
-template <typename ALayout, typename BLayout, typename CLayout>
-float grouped_gemm_tileloop(const ck_tile::stream_config& s,
-                            const ck_tile::index_t num_groups,
-                            void* kargs_ptr,
-                            bool splitk)
-{
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 32;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 1;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
-
-    constexpr bool DoubleSmemBuffer = false;
-#endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = true;
-#endif
-
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
-
-    constexpr int kBlockPerCu                         = 1;
-    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    constexpr ck_tile::index_t TileParitionerM01      = 4;
-
-    using GemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
-    using TilePartitioner = ck_tile::
-        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
-
-    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits<kPadM,
-                                                                           kPadN,
-                                                                           kPadK,
-                                                                           DoubleSmemBuffer,
-                                                                           ALayout,
-                                                                           BLayout,
-                                                                           CLayout>;
-    using GemmPipelineProblem =
-        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
-
-    float ave_time{0};
-
-    const auto Run = [&](const auto memory_operation_) {
-        constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
-        constexpr auto memory_operation = memory_operation_.value;
-
-        // We create the GEMM pipeline without specifying hotloop or tailnumber.
-        // These are automatically run inside the kernel based on the given input data.
-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           scheduler>;
-
-        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             ck_tile::tuple<>,
-                                             AccDataType,
-                                             CDataType,
-                                             ck_tile::tuple<>,
-                                             CLayout,
-                                             ck_tile::element_wise::PassThrough,
-                                             GemmPipelineProblem::kBlockSize,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC,
-                                             memory_operation>>;
-        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        constexpr dim3 blocks = Kernel::BlockSize();
-        const dim3 grids      = Kernel::MaxOccupancyGridSize(s);
-
-        if(s.log_level_ > 0)
-        {
-            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {"
-                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
-                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
-        }
-
-        ave_time =
-            ck_tile::launch_kernel(s,
-                                   ck_tile::make_kernel<blocks.x, kBlockPerCu>(
-                                       Kernel{},
-                                       grids,
-                                       blocks,
-                                       0,
-                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
-                                       num_groups));
-
-        return ave_time;
-    };
-
-    if(!splitk)
-    {
-        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                       ck_tile::memory_operation_enum::set>{});
-    }
-    else
-    {
-        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                       ck_tile::memory_operation_enum::atomic_add>{});
-    }
-
-    return ave_time;
-}
-
-#include "run_grouped_gemm_example.inc"
-
-constexpr bool Persistent = true;
-int main(int argc, char* argv[]) { return !run_grouped_gemm_example<Persistent>(argc, argv); }
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index b57ae22172..089d4c2a9d 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -562,6 +562,58 @@ struct NormalizeInInfer
     double epsilon_;
 };
 
+// used by Conv+Bias+BatchNorm+Clamp inference
+struct BiasNormalizeInInferClamp
+{
+    BiasNormalizeInInferClamp(float floor   = 0.f,
+                              float ceil    = NumericLimits<float>::Max(),
+                              float epsilon = 1e-4)
+        : clamp_(floor, ceil), epsilon_(epsilon)
+    {
+    }
+
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y,
+                                                  const T& x,
+                                                  const T& bias,
+                                                  const T& mean,
+                                                  const T& variance,
+                                                  const T& gamma,
+                                                  const T& beta) const
+    {
+        using ck::type_convert;
+        using ck::math::sqrt;
+
+        float tmp_x = type_convert<float>(x) + type_convert<float>(bias);
+
+        float tmp_y =
+            ((tmp_x - type_convert<float>(mean)) / sqrt(type_convert<float>(variance) + epsilon_)) *
+                type_convert<float>(gamma) +
+            type_convert<float>(beta);
+        clamp_(tmp_y, tmp_y);
+        y = type_convert<T>(tmp_y);
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()(float& y,
+                                                  const float& x,
+                                                  const float& bias,
+                                                  const float& mean,
+                                                  const float& variance,
+                                                  const float& gamma,
+                                                  const float& beta) const
+    {
+        using ck::type_convert;
+        using ck::math::sqrt;
+
+        float tmp_y = (((x + bias) - mean) / sqrt(variance + epsilon_)) * gamma + beta;
+        clamp_(y, tmp_y);
+    };
+
+    Clamp clamp_;
+    float epsilon_;
+};
+
 template <typename Y, typename X>
 struct UnaryTypeConvert;
 
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index 188cebaabc..9f3c996873 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -27,6 +27,7 @@
 #include "ck_tile/core/container/thread_buffer.hpp"
 #include "ck_tile/core/container/tuple.hpp"
 #include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/numeric/e8m0.hpp"
 #include "ck_tile/core/numeric/float8.hpp"
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/int8.hpp"
@@ -74,6 +75,7 @@
 #include "ck_tile/core/utility/literals.hpp"
 #include "ck_tile/core/utility/magic_div.hpp"
 #include "ck_tile/core/utility/philox_rand.hpp"
+#include "ck_tile/core/utility/print.hpp"
 #include "ck_tile/core/utility/random.hpp"
 #include "ck_tile/core/utility/reduce_operator.hpp"
 #include "ck_tile/core/utility/static_counter.hpp"
diff --git a/include/ck_tile/core/algorithm/coordinate_transform.hpp b/include/ck_tile/core/algorithm/coordinate_transform.hpp
index f7f9489f4c..7511413bba 100644
--- a/include/ck_tile/core/algorithm/coordinate_transform.hpp
+++ b/include/ck_tile/core/algorithm/coordinate_transform.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 #include "ck_tile/core/utility/magic_div.hpp"
+#include "ck_tile/core/utility/print.hpp"
 
 namespace ck_tile {
 
@@ -139,20 +140,19 @@ struct pass_through : public base_transform<1, 1>
     {
         return make_tuple(low_vector_lengths, low_vector_strides);
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("pass_through{");
-
-        //
-        printf("up_lengths_:");
-        print(up_lengths_);
-
-        //
-        printf("}");
-    }
 };
 
+template <typename LowLength>
+CK_TILE_HOST_DEVICE static void print(const pass_through<LowLength>& pt)
+{
+    printf("pass_through{");
+
+    printf("up_lengths_: ");
+    print(pt.get_upper_lengths());
+
+    printf("}");
+}
+
 template <typename LowLength,
           typename LeftPadLength,
           typename RightPadLength,
@@ -229,29 +229,25 @@ struct pad : public base_transform<1, 1>
                ck_tile::is_known_at_compile_time<LeftPadLength>::value &&
                ck_tile::is_known_at_compile_time<RightPadLength>::value;
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("pad{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        //
-        printf("left_pad_length_: ");
-        print(left_pad_length_);
-        printf(", ");
-
-        //
-        printf("right_pad_length_: ");
-        print(right_pad_length_);
-
-        printf("}");
-    }
 };
 
+template <typename LowLength,
+          typename LeftPadLength,
+          typename RightPadLength,
+          bool SkipIsValidCheck>
+CK_TILE_HOST_DEVICE static void
+print(const pad<LowLength, LeftPadLength, RightPadLength, SkipIsValidCheck>& p)
+{
+    printf("pad{");
+    printf("up_lengths_: ");
+    print(p.up_lengths_);
+    printf(", left_pad_length_: ");
+    print(p.left_pad_length_);
+    printf(", right_pad_length_: ");
+    print(p.right_pad_length_);
+    printf("}");
+}
+
 template <typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false>
 struct left_pad
 {
@@ -330,24 +326,20 @@ struct left_pad
         //       It's up to runtime to check the padding length should be multiple of vector length
         return make_tuple(low_vector_lengths, low_vector_strides);
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("left_pad{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        //
-        printf("left_pad_length_: ");
-        print(left_pad_length_);
-
-        printf("}");
-    }
 };
 
+template <typename LowLength, typename LeftPadLength, bool SkipIsValidCheck>
+CK_TILE_HOST_DEVICE static void
+print(const left_pad<LowLength, LeftPadLength, SkipIsValidCheck>& lp)
+{
+    printf("left_pad{");
+    printf("up_lengths_: ");
+    print(lp.up_lengths_);
+    printf(", left_pad_length_: ");
+    print(lp.left_pad_length_);
+    printf("}");
+}
+
 template <typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
 struct right_pad : public base_transform<1, 1>
 {
@@ -430,24 +422,20 @@ struct right_pad : public base_transform<1, 1>
         //       It's up to runtime to check the padding length should be multiple of vector length
         return make_tuple(low_vector_lengths, low_vector_strides);
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("right_pad{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        //
-        printf("right_pad_length_: ");
-        print(right_pad_length_);
-
-        printf("}");
-    }
 };
 
+template <typename LowLength, typename RightPadLength, bool SkipIsValidCheck>
+CK_TILE_HOST_DEVICE static void
+print(const right_pad<LowLength, RightPadLength, SkipIsValidCheck>& rp)
+{
+    printf("right_pad{");
+    printf("up_lengths_: ");
+    print(rp.up_lengths_);
+    printf(", right_pad_length_: ");
+    print(rp.right_pad_length_);
+    printf("}");
+}
+
 // idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1]
 // UpLengths and Coefficients can be either of the followings:
 //   1) Tuple of index_t, which is known at run-time, or
@@ -532,24 +520,19 @@ struct embed : public base_transform<1, UpLengths::size()>
         return ck_tile::is_known_at_compile_time<UpLengths>::value &&
                ck_tile::is_known_at_compile_time<Coefficients>::value;
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("embed{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        //
-        printf("coefficients_: ");
-        print(coefficients_);
-
-        printf("}");
-    }
 };
 
+template <typename UpLengths, typename Coefficients>
+CK_TILE_HOST_DEVICE static void print(const embed<UpLengths, Coefficients>& e)
+{
+    printf("embed{");
+    printf("up_lengths_: ");
+    print(e.up_lengths_);
+    printf(", coefficients_: ");
+    print(e.coefficients_);
+    printf("}");
+}
+
 template <typename LowLengths>
 struct lambda_merge_generate_MagicDivision_calculate_magic_divisor
 {
@@ -699,24 +682,19 @@ struct merge_v2_magic_division : public base_transform<LowLengths::size(), 1>
 
         return make_tuple(up_vector_lengths, up_vector_strides);
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("merge_v2_magic_division{");
-
-        //
-        printf("low_lengths_ ");
-        print(low_lengths_);
-        printf(", ");
-
-        //
-        printf("up_lengths_ ");
-        print(up_lengths_);
-
-        printf("}");
-    }
 };
 
+template <typename LowLengths>
+CK_TILE_HOST_DEVICE static void print(const merge_v2_magic_division<LowLengths>& m)
+{
+    printf("merge_v2_magic_division{");
+    printf("low_lengths_: ");
+    print(m.low_lengths_);
+    printf(", up_lengths_: ");
+    print(m.up_lengths_);
+    printf("}");
+}
+
 // Implementation of "merge" transformation primitive that uses division and mod. It is supposed to
 // be used for low_lengths that are known at compile time and are power of 2, otherwise performance
 // will be very bad
@@ -830,29 +808,21 @@ struct merge_v3_division_mod : public base_transform<LowLengths::size(), 1>
 
         return make_tuple(up_vector_lengths, up_vector_strides);
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("Merge_v3_direct_division_mod{");
-
-        //
-        printf("low_lengths_ ");
-        print(low_lengths_);
-        printf(", ");
-
-        //
-        printf("low_lengths_scan_ ");
-        print(low_lengths_scan_);
-        printf(", ");
-
-        //
-        printf("up_lengths_ ");
-        print(up_lengths_);
-
-        printf("}");
-    }
 };
 
+template <typename LowLengths>
+CK_TILE_HOST_DEVICE static void print(const merge_v3_division_mod<LowLengths>& m)
+{
+    printf("merge_v3_division_mod{");
+    printf("low_lengths_: ");
+    print(m.low_lengths_);
+    printf(", low_lengths_scan_: ");
+    print(m.low_lengths_scan_);
+    printf(", up_lengths_: ");
+    print(m.up_lengths_);
+    printf("}");
+}
+
 template <typename UpLengths, bool Use24BitIntegerCalculation>
 struct unmerge : public base_transform<1, UpLengths::size()>
 {
@@ -958,24 +928,19 @@ struct unmerge : public base_transform<1, UpLengths::size()>
 
         return make_tuple(up_vector_lengths, up_vector_strides);
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("unmerge{");
-
-        //
-        printf("up_lengths_");
-        print(up_lengths_);
-        printf(", ");
-
-        //
-        printf("up_lengths_scan_");
-        print(up_lengths_scan_);
-
-        printf("}");
-    }
 };
 
+template <typename UpLengths, bool Use24BitIntegerCalculation>
+CK_TILE_HOST_DEVICE static void print(const unmerge<UpLengths, Use24BitIntegerCalculation>& u)
+{
+    printf("unmerge{");
+    printf("up_lengths_: ");
+    print(u.up_lengths_);
+    printf(", up_lengths_scan_: ");
+    print(u.up_lengths_scan_);
+    printf("}");
+}
+
 template <typename LowerIndex>
 struct freeze : public base_transform<1, 0>
 {
@@ -1023,19 +988,17 @@ struct freeze : public base_transform<1, 0>
     {
         return ck_tile::is_known_at_compile_time<LowerIndex>::value;
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("freeze{");
-
-        //
-        printf("low_idx_: ");
-        print(low_idx_);
-
-        printf("}");
-    }
 };
 
+template <typename LowerIndex>
+CK_TILE_HOST_DEVICE static void print(const freeze<LowerIndex>& f)
+{
+    printf("freeze{");
+    printf("low_idx_: ");
+    print(f.low_idx_);
+    printf("}");
+}
+
 // insert a dangling upper dimension without lower dimension
 template <typename UpperLength>
 struct insert : public base_transform<0, 1>
@@ -1092,18 +1055,17 @@ struct insert : public base_transform<0, 1>
     {
         return ck_tile::is_known_at_compile_time<UpperLength>::value;
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("insert{");
-
-        //
-        print(up_lengths_);
-
-        printf("}");
-    }
 };
 
+template <typename UpperLength>
+CK_TILE_HOST_DEVICE static void print(const insert<UpperLength>& i)
+{
+    printf("insert{");
+    printf("up_lengths_: ");
+    print(i.up_lengths_);
+    printf("}");
+}
+
 // replicate the original tensor and create a higher dimensional tensor
 template <typename UpLengths>
 struct replicate : public base_transform<0, UpLengths::size()>
@@ -1152,21 +1114,19 @@ struct replicate : public base_transform<0, UpLengths::size()>
         return ck_tile::is_known_at_compile_time<UpLengths>::value;
     }
 
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("replicate{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-
-        printf("}");
-    }
-
     //
     UpLengths up_lengths_;
 };
 
+template <typename UpLengths>
+CK_TILE_HOST_DEVICE static void print(const replicate<UpLengths>& r)
+{
+    printf("replicate{");
+    printf("up_lengths_: ");
+    print(r.up_lengths_);
+    printf("}");
+}
+
 template <typename LowLength, typename SliceBegin, typename SliceEnd>
 struct slice : public base_transform<1, 1>
 {
@@ -1238,28 +1198,20 @@ struct slice : public base_transform<1, 1>
                ck_tile::is_known_at_compile_time<SliceBegin>::value &&
                ck_tile::is_known_at_compile_time<SliceEnd>::value;
     }
+};
 
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("slice{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        //
-        printf("slice_begin_: ");
-        print(slice_begin_);
-        printf(", ");
-
-        //
-        printf("slice_end_: ");
-        print(slice_end_);
-
-        printf("}");
-    } // namespace ck
-}; // namespace ck
+template <typename LowLength, typename SliceBegin, typename SliceEnd>
+CK_TILE_HOST_DEVICE static void print(const slice<LowLength, SliceBegin, SliceEnd>& s)
+{
+    printf("slice{");
+    printf("up_lengths_: ");
+    print(s.up_lengths_);
+    printf(", slice_begin_: ");
+    print(s.slice_begin_);
+    printf(", slice_end_: ");
+    print(s.slice_end_);
+    printf("}");
+}
 
 /*
  * \brief lower_idx = upper_idx % modulus.
@@ -1328,19 +1280,19 @@ struct modulo : public base_transform<1, 1>
     {
         return ck_tile::is_known_at_compile_time<UpLengths>::value;
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("Modulus{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-
-        printf("}");
-    }
 };
 
+template <typename Modulus, typename UpLength>
+CK_TILE_HOST_DEVICE static void print(const modulo<Modulus, UpLength>& m)
+{
+    printf("modulo{");
+    printf("modulus_: ");
+    print(m.modulus_);
+    printf(", up_lengths_: ");
+    print(m.up_lengths_);
+    printf("}");
+}
+
 // 2D XOR, NOTE: "xor" is a keyword
 template <typename LowLengths>
 struct xor_t : public base_transform<2, 2>
@@ -1424,20 +1376,17 @@ struct xor_t : public base_transform<2, 2>
 
         return make_tuple(up_vector_lengths, up_vector_strides);
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("xor_t{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        printf("}");
-    }
 };
 
+template <typename LowLengths>
+CK_TILE_HOST_DEVICE static void print(const xor_t<LowLengths>& x)
+{
+    printf("xor_t{");
+    printf("up_lengths_: ");
+    print(x.up_lengths_);
+    printf("}");
+}
+
 template <typename LowLength, typename OffsetLength>
 struct offset : public base_transform<1, 1>
 {
@@ -1509,24 +1458,19 @@ struct offset : public base_transform<1, 1>
         return ck_tile::is_known_at_compile_time<UpLengths>::value &&
                ck_tile::is_known_at_compile_time<OffsetLength>::value;
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("offset{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        //
-        printf("offset_length_: ");
-        print(offset_length_);
-
-        printf("}");
-    }
 };
 
+template <typename LowLength, typename OffsetLength>
+CK_TILE_HOST_DEVICE static void print(const offset<LowLength, OffsetLength>& o)
+{
+    printf("offset{");
+    printf("up_lengths_: ");
+    print(o.up_lengths_);
+    printf(", offset_length_: ");
+    print(o.offset_length_);
+    printf("}");
+}
+
 template <typename UpLength, typename IndexingAdaptor>
 struct indexing : public base_transform<1, 1>
 {
@@ -1595,20 +1539,19 @@ struct indexing : public base_transform<1, 1>
         return ck_tile::is_known_at_compile_time<UpLengths>::value &&
                IndexingAdaptor::is_known_at_compile_time();
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("embed{");
-
-        //
-        printf("up_lengths_: ");
-        print(up_lengths_);
-        printf(", ");
-
-        printf("}");
-    }
 };
 
+template <typename UpLength, typename IndexingAdaptor>
+CK_TILE_HOST_DEVICE static void print(const indexing<UpLength, IndexingAdaptor>& i)
+{
+    printf("indexing{");
+    printf("up_lengths_: ");
+    print(i.up_lengths_);
+    printf(", iadaptor_: ");
+    print(i.iadaptor_);
+    printf("}");
+}
+
 //*******************************************************************************************************
 
 template <typename LowLength>
diff --git a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
index 8a3de3e5e0..1f6c389090 100644
--- a/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
+++ b/include/ck_tile/core/algorithm/static_encoding_pattern.hpp
@@ -77,6 +77,7 @@
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/tensor/tile_distribution.hpp"
 #include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
+#include "ck_tile/core/utility/print.hpp"
 
 namespace ck_tile {
 
@@ -317,4 +318,51 @@ struct TileDistributionEncodingPattern2D<BlockSize,
     }
 };
 
+// Helper function to convert enum to string
+constexpr const char* tile_distribution_pattern_to_string(tile_distribution_pattern pattern)
+{
+    switch(pattern)
+    {
+    case tile_distribution_pattern::thread_raked: return "thread_raked";
+    case tile_distribution_pattern::warp_raked: return "warp_raked";
+    case tile_distribution_pattern::block_raked: return "block_raked";
+    default: return "unknown";
+    }
+}
+
+template <index_t BlockSize,
+          index_t YPerTile,
+          index_t XPerTile,
+          index_t VecSize,
+          tile_distribution_pattern DistributionPattern,
+          index_t NumWaveGroups>
+CK_TILE_HOST_DEVICE void print(const TileDistributionEncodingPattern2D<BlockSize,
+                                                                       YPerTile,
+                                                                       XPerTile,
+                                                                       VecSize,
+                                                                       DistributionPattern,
+                                                                       NumWaveGroups>&)
+{
+    using PatternType = TileDistributionEncodingPattern2D<BlockSize,
+                                                          YPerTile,
+                                                          XPerTile,
+                                                          VecSize,
+                                                          DistributionPattern,
+                                                          NumWaveGroups>;
+
+    printf("TileDistributionEncodingPattern2D<BlockSize:%d, YPerTile:%d, XPerTile:%d, "
+           "VecSize:%d, %s>: ",
+           BlockSize,
+           YPerTile,
+           XPerTile,
+           VecSize,
+           tile_distribution_pattern_to_string(DistributionPattern));
+    printf("{<Y0, Y1, Y2>: <%d, %d, %d>, <X0, X1>: <%d, %d>}\n",
+           PatternType::Y0,
+           PatternType::Y1,
+           PatternType::Y2,
+           PatternType::X0,
+           PatternType::X1);
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 29cc3fefe5..35da19cd3e 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -2754,54 +2754,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
 #endif
 }
 
-template <typename T, index_t NumElemsPerThread>
-CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
-                                                  const index_t global_offset,
-                                                  T* lds_base_ptr,
-                                                  const index_t lds_offset,
-                                                  const bool is_valid,
-                                                  const index_t src_element_space_size)
-{
-    const uint32_t* global_ptr =
-        reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
-    const int32x4_t src_resource =
-        make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T));
-    const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
-
-#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
-    T* lds_ptr = lds_base_ptr + lds_offset;
-    auto const lds_ptr_sgpr =
-        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
-    asm volatile("s_mov_b32 m0, %0; \n\t"
-                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
-                 "v"(global_offset_bytes),
-                 "s"(src_resource)
-                 : "memory");
-#else
-    // Direct loads require that each thread reads and writes exactly a single DWORD.
-#if defined(__gfx9__)
-    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
-#endif
-    // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
-    // For gfx950: supports 1, 3, or 4 DWORDs per thread
-    // For gfx942: supports exactly 1 DWORD per thread
-#if defined(__gfx950__)
-    constexpr auto dword_bytes = 4;
-    static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
-                  bytes_per_thread == dword_bytes * 4);
-#elif defined(__gfx9__)
-    constexpr auto dword_bytes = 4;
-    static_assert(bytes_per_thread == dword_bytes);
-#endif
-    // LDS pointer must be attributed with the LDS address space.
-    as3_uint32_ptr lds_ptr =
-        reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
-
-    llvm_amdgcn_raw_buffer_load_lds(
-        src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
-#endif
-}
-
 #if defined(__gfx950__)
 template <typename T, index_t N, address_space_enum BufferAddressSpace>
 __device__ auto amd_transpose_load_to_vgpr(const T* in_ptr)
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
index 27c99e0cda..5946ed54af 100644
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -178,22 +178,6 @@ CK_TILE_DEVICE void s_waitcnt_barrier()
     __builtin_amdgcn_s_barrier();
 }
 
-CK_TILE_DEVICE void block_sync_lds_direct_load()
-{
-#if 1
-    // invoke clang builtins which *should* produce the same result as the inline asm below
-    // difference: inline asm is being compiled to wait vmcnt(0) after the barrier
-    s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>();
-#else
-    // same content as in old CK (#999)
-    asm volatile("\
-    s_waitcnt vmcnt(0) \n \
-    s_waitcnt lgkmcnt(0) \n \
-    s_barrier \
-    " ::);
-#endif
-}
-
 CK_TILE_DEVICE void s_nop(index_t cnt = 0)
 {
 #if 1
@@ -238,6 +222,21 @@ CK_TILE_HOST_DEVICE constexpr index_t get_smem_capacity()
 #endif
 }
 
+/// Helper function to convert address space enum to string
+CK_TILE_HOST_DEVICE constexpr const char* address_space_to_string(address_space_enum addr_space)
+{
+    switch(addr_space)
+    {
+    case address_space_enum::generic: return "generic";
+    case address_space_enum::global: return "global";
+    case address_space_enum::lds: return "lds";
+    case address_space_enum::sgpr: return "sgpr";
+    case address_space_enum::constant: return "constant";
+    case address_space_enum::vgpr: return "vgpr";
+    default: return "unknown";
+    }
+}
+
 // Architecture tags
 struct gfx11_t
 {
@@ -254,5 +253,4 @@ CK_TILE_DEVICE static constexpr auto get_device_arch()
     return gfx12_t{};
 #endif
 }
-
 } // namespace ck_tile
diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp
index 94aa40e278..352c645325 100644
--- a/include/ck_tile/core/container/array.hpp
+++ b/include/ck_tile/core/container/array.hpp
@@ -177,9 +177,27 @@ struct array<T, 0>
     CK_TILE_HOST_DEVICE constexpr array() {}
     CK_TILE_HOST_DEVICE static constexpr index_t size() { return 0; }
     CK_TILE_HOST_DEVICE static constexpr bool is_static() { return is_static_v<T>; };
-    CK_TILE_HOST_DEVICE void print() const { printf("array{size: 0, data: []}"); }
 };
 
+template <typename T, index_t N>
+CK_TILE_HOST_DEVICE static void print(const array<T, N>& a)
+{
+    printf("array{size: %ld, data: [", static_cast<long>(N));
+    for(index_t i = 0; i < N; ++i)
+    {
+        if(i > 0)
+            printf(", ");
+        print(a[i]);
+    }
+    printf("]}");
+}
+
+template <typename T>
+CK_TILE_HOST_DEVICE static void print(const array<T, 0>&)
+{
+    printf("array{size: 0, data: []}");
+}
+
 template <typename, typename>
 struct vector_traits;
 
diff --git a/include/ck_tile/core/container/map.hpp b/include/ck_tile/core/container/map.hpp
index 87b180cafc..7697995c92 100644
--- a/include/ck_tile/core/container/map.hpp
+++ b/include/ck_tile/core/container/map.hpp
@@ -139,26 +139,21 @@ struct map
 
     // WARNING: needed by compiler for C++ range-based for loop only, don't use this function!
     CK_TILE_HOST_DEVICE constexpr iterator end() { return iterator{impl_, size_}; }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("map{size_: %d, ", size_);
-        //
-        printf("impl_: [");
-        //
-        for(const auto& [k, d] : *this)
-        {
-            printf("{key: ");
-            print(k);
-            printf(", data: ");
-            print(d);
-            printf("}, ");
-        }
-        //
-        printf("]");
-        //
-        printf("}");
-    }
 };
 
+template <typename key, typename data, index_t max_size>
+CK_TILE_HOST_DEVICE static void print(const map<key, data, max_size>& m)
+{
+    printf("map{size_: %d, impl_: [", m.size_);
+    for(const auto& [k, d] : m)
+    {
+        printf("{key: ");
+        print(k);
+        printf(", data: ");
+        print(d);
+        printf("}, ");
+    }
+    printf("]}");
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp
index 94309dd5dd..905b32dd15 100644
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -9,13 +9,10 @@
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/utility/to_sequence.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
-#include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/print.hpp"
 
 namespace ck_tile {
 
-template <index_t, index_t, index_t>
-struct static_for;
-
 template <index_t...>
 struct sequence;
 
@@ -196,15 +193,24 @@ struct sequence
     {
         return sequence<f(Is)...>{};
     }
-
-    CK_TILE_HOST_DEVICE static void print()
-    {
-        printf("sequence{size: %d, data: [", size());
-        ((printf("%d ", Is)), ...);
-        printf("]}");
-    }
 };
 
+template <index_t... Is>
+CK_TILE_HOST_DEVICE static void print(const sequence<Is...>&)
+{
+    printf("sequence<");
+    if constexpr(sizeof...(Is) > 0)
+    {
+        bool first = true;
+        (([&first](index_t value) {
+             printf("%s%d", first ? "" : ", ", value);
+             first = false;
+         }(Is)),
+         ...);
+    }
+    printf(">");
+}
+
 namespace impl {
 template <typename T, T... Ints>
 struct __integer_sequence;
diff --git a/include/ck_tile/core/container/thread_buffer.hpp b/include/ck_tile/core/container/thread_buffer.hpp
index 77c46e1b8c..d67581e7d2 100644
--- a/include/ck_tile/core/container/thread_buffer.hpp
+++ b/include/ck_tile/core/container/thread_buffer.hpp
@@ -42,7 +42,11 @@ struct thread_buffer {
 
     // TODO: this ctor can't ignore
     CK_TILE_HOST_DEVICE constexpr thread_buffer() : data{} {}
-    CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{o} {}
+    CK_TILE_HOST_DEVICE constexpr thread_buffer(const value_type & o) : data{} {
+        static_for<0, N, 1>{}(
+            [&](auto i) { data[i] = o; }
+        );
+    }
 
     CK_TILE_HOST_DEVICE static constexpr auto size() { return N; }
     CK_TILE_HOST_DEVICE auto & get() {return data; }
diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp
index 63d145d8b9..4c48b3d477 100644
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -300,12 +300,29 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
 #undef TP_COM_
 };
 
-template <typename, typename = void>
+template <typename... T>
+CK_TILE_HOST_DEVICE void print(const tuple<T...>& t)
+{
+    printf("tuple<");
+    if constexpr(sizeof...(T) > 0)
+    {
+        bool first = true;
+        static_for<0, sizeof...(T), 1>{}([&t, &first](auto i) {
+            if(!first)
+                printf(", ");
+            print(t.get(i));
+            first = false;
+        });
+    }
+    printf(">");
+}
+
+template <typename, typename>
 struct vector_traits;
 
 // specialization for array
 template <typename... T>
-struct vector_traits<tuple<T...>>
+struct vector_traits<tuple<T...>, void>
 {
     using scalar_type                    = __type_pack_element<0, T...>;
     static constexpr index_t vector_size = sizeof...(T);
diff --git a/include/ck_tile/core/numeric/e8m0.hpp b/include/ck_tile/core/numeric/e8m0.hpp
new file mode 100644
index 0000000000..ea94880f27
--- /dev/null
+++ b/include/ck_tile/core/numeric/e8m0.hpp
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/mxfp_convert.hpp"
+
+namespace ck_tile {
+
+/**
+ * @brief Unsigned representation of a conventional biased Float32 exponent.
+ *
+ * bias = 127;
+ *
+ * E8M0_1   = 0b01111111; => 2^(127-127) = 1
+ * E8M0_2   = 0b10000000; => 2^(128-127) = 2^1 = 2
+ * E8M0_3   = 0b10000010; => 2^(130-127) = 2^3 = 8
+ * E8M0_135 = 0b10000111; => 2^(135-127) = 2^8 = 256
+ * E8M0_142 = 0b10001110; => 2^(142-127) = 2^15 = 32768
+ * E8M0_MIN = 0b00000000; => 2^-127
+ * E8M0_MAX = 0b11111110; => 2^127
+ * E8M0_NAN = 0b11111111; => NaN
+ */
+
+struct e8m0_bexp_t
+{
+    using raw_type = uint8_t;
+    using type     = raw_type;
+
+    raw_type data;
+
+    CK_TILE_HOST_DEVICE constexpr e8m0_bexp_t() : data{type{0b11111111}} {}
+    CK_TILE_HOST_DEVICE explicit constexpr e8m0_bexp_t(type init) : data{init} {}
+    CK_TILE_HOST_DEVICE explicit constexpr e8m0_bexp_t(float scale)
+        : e8m0_bexp_t(static_cast<type>(numeric_utils<float>::get_exponent(scale)))
+    {
+    }
+    CK_TILE_HOST_DEVICE constexpr operator type() const { return data; }
+    CK_TILE_HOST_DEVICE constexpr raw_type& get() { return data; }
+    CK_TILE_HOST_DEVICE constexpr raw_type get() const { return data; }
+    CK_TILE_HOST_DEVICE constexpr operator float() const;
+
+    constexpr bool operator==(const e8m0_bexp_t& other) const { return data == other.data; }
+
+    constexpr bool operator!=(const e8m0_bexp_t& other) const { return data != other.data; }
+};
+
+using e8m0_t     = e8m0_bexp_t;
+using e8m0_raw_t = typename e8m0_t::raw_type;
+
+template <>
+struct numeric_traits<e8m0_t>
+{
+    using bitwise_type = e8m0_raw_t;
+
+    static constexpr int exp        = 8;
+    static constexpr int mant       = 0;
+    static constexpr int bias       = 127;
+    static constexpr int PackedSize = 1;
+};
+
+// limits
+template <class T>
+struct numeric;
+
+template <>
+struct numeric<e8m0_t>
+{
+    static constexpr e8m0_raw_t binary_min = 0b00000000; // 2^-127
+    static constexpr e8m0_raw_t binary_max = 0b11111110; // 2^127
+    static constexpr e8m0_raw_t binary_nan = 0b11111111;
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t min() { return e8m0_t{binary_min}; }
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t max() { return e8m0_t{binary_max}; }
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t quiet_NaN() { return e8m0_t{binary_nan}; }
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t signaling_NaN() { return e8m0_t{binary_nan}; }
+    CK_TILE_HOST_DEVICE static constexpr bool has_inf() { return false; }
+
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t epsilon() { return signaling_NaN(); }
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t round_error() { return signaling_NaN(); }
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t zero() { return signaling_NaN(); }
+    CK_TILE_HOST_DEVICE static constexpr e8m0_t infinity() { return signaling_NaN(); }
+};
+
+CK_TILE_HOST_DEVICE constexpr e8m0_bexp_t::operator float() const
+{
+    using traits = numeric_traits<float>;
+    if(data == numeric<e8m0_t>::binary_nan)
+    {
+        return traits::NaN;
+    }
+    else if(data == 0)
+    {
+        return std::numeric_limits<float>::min();
+    }
+    else
+    {
+        return bit_cast<float>(static_cast<traits::bitwise_type>(data) << traits::mant);
+    }
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/numeric/integral_constant.hpp b/include/ck_tile/core/numeric/integral_constant.hpp
index 33c24da8c5..2ba2fd10c6 100644
--- a/include/ck_tile/core/numeric/integral_constant.hpp
+++ b/include/ck_tile/core/numeric/integral_constant.hpp
@@ -19,14 +19,18 @@ struct constant
     CK_TILE_HOST_DEVICE static constexpr bool is_static() { return true; }
 };
 
+template <auto v>
+CK_TILE_HOST_DEVICE static void print(const constant<v>&)
+{
+    printf("%ld", static_cast<long>(v));
+}
+
 template <typename T, T v>
 struct integral_constant : constant<v>
 {
     using value_type         = T;
     using type               = integral_constant; // using injected-class-name
     static constexpr T value = v;
-    // constexpr CK_TILE_HOST_DEVICE operator   value_type() const noexcept { return value; }
-    // constexpr CK_TILE_HOST_DEVICE value_type operator()() const noexcept { return value; } //
 };
 
 template <index_t v>
diff --git a/include/ck_tile/core/numeric/mxfp_convert.hpp b/include/ck_tile/core/numeric/mxfp_convert.hpp
index b2e138e880..9b378933d0 100644
--- a/include/ck_tile/core/numeric/mxfp_convert.hpp
+++ b/include/ck_tile/core/numeric/mxfp_convert.hpp
@@ -12,15 +12,19 @@ struct numeric_utils : numeric_traits<T>
 
     using traits   = numeric_traits<T>;
     using _numeric = numeric<T>;
-    using raw_type = typename T::raw_type;
+    using raw_type = typename traits::bitwise_type;
 
     static constexpr int exp_mask = (1 << traits::exp) - 1;
 
-    static constexpr int get_exponent(raw_type x)
+    static constexpr raw_type get_exponent(raw_type x)
     {
         // TODO: check if repeated calls are optimized.
         return (x >> traits::mant) & exp_mask;
     }
+    static constexpr raw_type get_exponent(const T& x)
+    {
+        return get_exponent(bit_cast<raw_type>(x));
+    }
     static constexpr bool is_positive(raw_type x)
     {
         return (x >> (traits::exp + traits::mant)) == _numeric::binary_zero;
@@ -33,7 +37,7 @@ struct numeric_utils : numeric_traits<T>
     static constexpr double get_mantissa(raw_type x)
     {
         double mantissa = is_subnormal(x) ? 0.0f : 1.0f;
-        for(uint32_t i = 0; i < traits::mant; ++i)
+        for(raw_type i = 0; i < traits::mant; ++i)
         {
             mantissa += std::ldexp(static_cast<float>(x & 0b1), -(traits::mant - i));
             x >>= 1;
@@ -43,22 +47,23 @@ struct numeric_utils : numeric_traits<T>
 };
 
 template <typename T>
-CK_TILE_HOST_DEVICE float convert_to_float(typename T::raw_type data, int scale_exp = 127)
+CK_TILE_HOST_DEVICE float convert_to_float(typename T::raw_type data, float scale = 1.f)
 {
-    using utils                    = numeric_utils<T>;
-    static constexpr int e8m0_bias = 127; // TODO: make it generic.
-    float sign                     = utils::is_positive(data) ? 1.0 : -1.0;
-    int exp    = (utils::is_subnormal(data) ? 1 : utils::get_exponent(data)) - utils::bias;
-    float mant = utils::get_mantissa(data);
+    using utils = numeric_utils<T>;
+    float sign  = utils::is_positive(data) ? 1.0 : -1.0;
+    int exp     = (utils::is_subnormal(data) ? 1 : utils::get_exponent(data)) - utils::bias;
+    float mant  = utils::get_mantissa(data);
 
-    return std::ldexp(sign * mant, exp + scale_exp - e8m0_bias);
+    return std::ldexp(sign * mant * scale, exp);
 }
 
 template <typename T>
-CK_TILE_HOST_DEVICE typename T::raw_type convert_to_type(float value)
+CK_TILE_HOST_DEVICE typename T::raw_type convert_to_type(float value, float scale = 1.f)
 {
     using bitwise_type = typename numeric_traits<T>::bitwise_type;
 
+    value /= scale;
+
     if(std::abs(value) > float(numeric<T>::max()))
     {
         float max_value = numeric<T>::max();
diff --git a/include/ck_tile/core/numeric/pk_fp4.hpp b/include/ck_tile/core/numeric/pk_fp4.hpp
index 0dee750b69..a345cd1b75 100644
--- a/include/ck_tile/core/numeric/pk_fp4.hpp
+++ b/include/ck_tile/core/numeric/pk_fp4.hpp
@@ -23,14 +23,11 @@ using fp32x2_t = float __attribute__((ext_vector_type(2)));
 using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
 using bf16x2_t = bf16_raw_t __attribute__((ext_vector_type(2)));
 
-CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float);
+CK_TILE_HOST_DEVICE constexpr uint8_t float_to_e2m1(float x, float scale = 1.f);
 
 // TODO: Add stochastic method
 struct pk_float4_e2m1_t
 {
-    static constexpr int exponent = 2;
-    static constexpr int mantissa = 1;
-    static constexpr int bias     = 1;
     // TODO: Can we merge raw_type and type?
     using raw_type = uint8_t;
     using type     = raw_type;
@@ -41,18 +38,27 @@ struct pk_float4_e2m1_t
     CK_TILE_HOST_DEVICE constexpr pk_float4_e2m1_t(T init) : data{static_cast<type>(init)}
     {
     }
-    CK_TILE_HOST_DEVICE explicit constexpr pk_float4_e2m1_t(float init) : data{float_to_e2m1(init)}
+    CK_TILE_HOST_DEVICE explicit constexpr pk_float4_e2m1_t(float init, float scale = 1.f)
+        : data{float_to_e2m1(init, scale)}
     {
     }
     CK_TILE_HOST_DEVICE constexpr operator type() const { return data; }
     CK_TILE_HOST_DEVICE constexpr raw_type& get() { return data; }
     CK_TILE_HOST_DEVICE constexpr raw_type get() const { return data; }
-    CK_TILE_HOST_DEVICE constexpr operator float() const;
-    CK_TILE_HOST_DEVICE constexpr operator fp32x2_t() const;
-    CK_TILE_HOST_DEVICE constexpr operator fp16_t() const;
-    CK_TILE_HOST_DEVICE constexpr operator fp16x2_t() const;
-    CK_TILE_HOST_DEVICE constexpr operator bf16_t() const;
-    CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const;
+
+    CK_TILE_HOST_DEVICE constexpr float to_float(float scale = 1.f) const;
+    CK_TILE_HOST_DEVICE constexpr fp32x2_t to_fp32x2(float scale = 1.f) const;
+    CK_TILE_HOST_DEVICE constexpr fp16_t to_fp16(float scale = 1.f) const;
+    CK_TILE_HOST_DEVICE constexpr fp16x2_t to_fp16x2(float scale = 1.f) const;
+    CK_TILE_HOST_DEVICE constexpr bf16_t to_bf16(float scale = 1.f) const;
+    CK_TILE_HOST_DEVICE constexpr bf16x2_t to_bf16x2(float scale = 1.f) const;
+
+    CK_TILE_HOST_DEVICE constexpr operator float() const { return to_float(); }
+    CK_TILE_HOST_DEVICE constexpr operator fp32x2_t() const { return to_fp32x2(); }
+    CK_TILE_HOST_DEVICE constexpr operator fp16_t() const { return to_fp16(); }
+    CK_TILE_HOST_DEVICE constexpr operator fp16x2_t() const { return to_fp16x2(); }
+    CK_TILE_HOST_DEVICE constexpr operator bf16_t() const { return to_bf16(); }
+    CK_TILE_HOST_DEVICE constexpr operator bf16x2_t() const { return to_bf16x2(); }
 
     template <index_t I>
     CK_TILE_HOST_DEVICE constexpr raw_type unpack(number<I>) const;
@@ -191,131 +197,160 @@ CK_TILE_DEVICE pk_fp4_raw_t _to_f4(T src, float scale = 1.0f)
 } // namespace impl
 #endif
 
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator bf16_t() const
+CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_t::to_bf16(float scale) const
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_from_f4<bf16_t>(data);
+    return impl::_from_f4<bf16_t>(data, scale);
 #else
-    return bf16_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{})))};
+    return bf16_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale))};
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator bf16x2_t() const
+
+CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_t::to_bf16x2(float scale) const
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_from_f4<bf16x2_t>(data);
+    return impl::_from_f4<bf16x2_t>(data, scale);
 #else
-    return bf16x2_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}))),
-                    type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<1>{})))};
+    return bf16x2_t{type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale)),
+                    type_convert<bf16_t>(convert_to_float<pk_fp4_t>(unpack(number<1>{}), scale))};
 #endif
 }
 
 // TODO: make float_to_e2m1 generic so that we can convert from directrly.
-CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x)
+CK_TILE_HOST_DEVICE constexpr pk_fp4_raw_t float_to_e2m1(float x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_to_f4(x);
+    return impl::_to_f4(x, scale);
 #else
-    return convert_to_type<pk_fp4_t>(x);
+    return convert_to_type<pk_fp4_t>(x, scale);
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_to_fp32x2(const pk_fp4_t& x) { return fp32x2_t(x); }
-CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_to_fp16x2(const pk_fp4_t& x) { return fp16x2_t(x); }
-CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_to_bf16x2(const pk_fp4_t& x) { return bf16x2_t(x); }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t float_to_pk_fp4(const float& x) { return float_to_e2m1(x); }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4(const fp16_t& x)
+CK_TILE_HOST_DEVICE constexpr pk_fp4_t float_to_pk_fp4(const float& x, float scale)
+{
+    return float_to_e2m1(x, scale);
+}
+CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16_to_pk_fp4(const fp16_t& x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_to_f4(x);
+    return impl::_to_f4(x, scale);
 #else
-    return float_to_e2m1(type_convert<float>(x));
+    return float_to_e2m1(type_convert<float>(x), scale);
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x)
+CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16_to_pk_fp4(const bf16_t& x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_to_f4(x);
+    return impl::_to_f4(x, scale);
 #else
-    return float_to_e2m1(type_convert<float>(x));
+    return float_to_e2m1(type_convert<float>(x), scale);
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x)
+CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp16x2_to_pk_fp4(const fp16x2_t& x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_to_f4(x);
+    return impl::_to_f4(x, scale);
 #else
-    return pk_fp4_t::pack(float_to_e2m1(type_convert<float>(x[0])),
-                          float_to_e2m1(type_convert<float>(x[1])));
+    return pk_fp4_t::pack(float_to_e2m1(type_convert<float>(x[0]), scale),
+                          float_to_e2m1(type_convert<float>(x[1]), scale));
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x)
+CK_TILE_HOST_DEVICE constexpr pk_fp4_t bf16x2_to_pk_fp4(const bf16x2_t& x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_to_f4(x);
+    return impl::_to_f4(x, scale);
 #else
-    return pk_fp4_t::pack(float_to_e2m1(type_convert<float>(x[0])),
-                          float_to_e2m1(type_convert<float>(x[1])));
+    return pk_fp4_t::pack(float_to_e2m1(type_convert<float>(x[0]), scale),
+                          float_to_e2m1(type_convert<float>(x[1]), scale));
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x)
+CK_TILE_HOST_DEVICE constexpr pk_fp4_t fp32x2_to_pk_fp4(const fp32x2_t& x, float scale)
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_to_f4(x);
+    return impl::_to_f4(x, scale);
 #else
-    return pk_fp4_t::pack(float_to_e2m1(x[0]), float_to_e2m1(x[1]));
+    return pk_fp4_t::pack(float_to_e2m1(x[0], scale), float_to_e2m1(x[1], scale));
 #endif
 }
 
+CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_to_fp32x2(const pk_fp4_t& x, float scale)
+{
+    return x.to_fp32x2(scale);
+}
+CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_to_fp16x2(const pk_fp4_t& x, float scale)
+{
+    return x.to_fp16x2(scale);
+}
+CK_TILE_HOST_DEVICE constexpr bf16x2_t pk_fp4_to_bf16x2(const pk_fp4_t& x, float scale)
+{
+    return x.to_bf16x2(scale);
+}
+CK_TILE_HOST_DEVICE constexpr float pk_fp4_to_float(const pk_fp4_t& x, float scale)
+{
+    return x.to_float(scale);
+}
+CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_to_fp16(const pk_fp4_t& x, float scale)
+{
+    return x.to_fp16(scale);
+}
+CK_TILE_HOST_DEVICE constexpr bf16_t pk_fp4_to_bf16(const pk_fp4_t& x, float scale)
+{
+    return x.to_bf16(scale);
+}
+
 #if TEST_convert_with_table == 0
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator float() const
+CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_from_f4<fp32_t>(data);
+    return impl::_from_f4<fp32_t>(data, scale);
 #else
-    return convert_to_float<pk_fp4_t>(unpack(number<0>{}));
+    return convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale);
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp32x2_t() const
+CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_from_f4<fp32x2_t>(data);
+    return impl::_from_f4<fp32x2_t>(data, scale);
 #else
-    return fp32x2_t{convert_to_float<pk_fp4_t>(unpack(number<0>{})),
-                    convert_to_float<pk_fp4_t>(unpack(number<1>{}))};
+    return fp32x2_t{convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale),
+                    convert_to_float<pk_fp4_t>(unpack(number<1>{}), scale)};
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16_t() const
+
+CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_from_f4<fp16_t>(data);
+    return impl::_from_f4<fp16_t>(data, scale);
 #else
-    return fp16_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{})))};
+    return fp16_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale))};
 #endif
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16x2_t() const
+CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const
 {
 #if CK_TILE_FP4_CVT_DEVICE
-    return impl::_from_f4<fp16x2_t>(data);
+    return impl::_from_f4<fp16x2_t>(data, scale);
 #else
-    return fp16x2_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}))),
-                    type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<1>{})))};
+    return fp16x2_t{type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<0>{}), scale)),
+                    type_convert<fp16_t>(convert_to_float<pk_fp4_t>(unpack(number<1>{}), scale))};
 #endif
 }
 #else
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator float() const
+CK_TILE_HOST_DEVICE constexpr float pk_fp4_t::to_float(float scale) const
 {
-    return e2m1_to_fp32_table[data & 0xf];
+    return e2m1_to_fp32_table[unpack(number<0>{})] * scale;
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp32x2_t() const
+CK_TILE_HOST_DEVICE constexpr fp32x2_t pk_fp4_t::to_fp32x2(float scale) const
 {
-    return fp32x2_t{e2m1_to_fp32_table[data & 0xf], e2m1_to_fp32_table[data >> 4]};
+    return fp32x2_t{e2m1_to_fp32_table[unpack(number<0>{})] * scale, e2m1_to_fp32_table[unpack(number<1>{}] * scale};
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16_t() const
+CK_TILE_HOST_DEVICE constexpr fp16_t pk_fp4_t::to_fp16(float scale) const
 {
-    return e2m1_to_fp16_table[data & 0xf];
+    return type_convert<float>(e2m1_to_fp16_table[unpack(number<0>{})]) * scale;
 }
-CK_TILE_HOST_DEVICE constexpr pk_fp4_t::operator fp16x2_t() const
+CK_TILE_HOST_DEVICE constexpr fp16x2_t pk_fp4_t::to_fp16x2(float scale) const
 {
-    return fp16x2_t{e2m1_to_fp16_table[data & 0xf], e2m1_to_fp16_table[data >> 4]};
+    return fp16x2_t{
+        type_convert<fp16_t>(type_convert<float>(e2m1_to_fp16_table[unpack(number<0>{})]) * scale),
+        type_convert<fp16_t>(type_convert<float>(e2m1_to_fp16_table[unpack(number<1>{})]) * scale)};
 }
 #endif
 
diff --git a/include/ck_tile/core/numeric/type_convert.hpp b/include/ck_tile/core/numeric/type_convert.hpp
index 94d6e3cd34..1455fce0ea 100644
--- a/include/ck_tile/core/numeric/type_convert.hpp
+++ b/include/ck_tile/core/numeric/type_convert.hpp
@@ -64,6 +64,7 @@ CK_TILE_TYPE_CONVERT(bf8_t, bf8, float, float)
 
 CK_TILE_TYPE_CONVERT(float, float, int8_t, int8)
 CK_TILE_TYPE_CONVERT(int8_t, int8, float, float)
+#undef CK_TILE_TYPE_CONVERT
 
 } // namespace ck_tile
 
@@ -71,16 +72,36 @@ CK_TILE_TYPE_CONVERT(int8_t, int8, float, float)
 
 namespace ck_tile {
 
-CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp32x2_t, fp32x2)
-CK_TILE_TYPE_CONVERT(fp32x2_t, fp32x2, pk_fp4_t, pk_fp4)
-CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16x2_t, fp16x2)
-CK_TILE_TYPE_CONVERT(fp16x2_t, fp16x2, pk_fp4_t, pk_fp4)
-CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16x2_t, bf16x2)
-CK_TILE_TYPE_CONVERT(bf16x2_t, bf16x2, pk_fp4_t, pk_fp4)
-CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, float, float)
-CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16_t, bf16)
-CK_TILE_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16_t, fp16)
-#undef CK_TILE_TYPE_CONVERT
+template <typename Y, typename X>
+CK_TILE_HOST_DEVICE constexpr Y scaled_type_convert(X x, float scale);
+
+#define CK_TILE_SCALED_TYPE_CONVERT(dtype_, dname_, stype_, sname_)                       \
+    template <>                                                                           \
+    CK_TILE_HOST_DEVICE constexpr dtype_ scaled_type_convert<dtype_, stype_>(stype_ x,    \
+                                                                             float scale) \
+    {                                                                                     \
+        return sname_##_to_##dname_(x, scale);                                            \
+    }                                                                                     \
+    template <>                                                                           \
+    CK_TILE_HOST_DEVICE constexpr dtype_ type_convert<dtype_, stype_>(stype_ x)           \
+    {                                                                                     \
+        return sname_##_to_##dname_(x, 1.f);                                              \
+    }
+
+CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp32x2_t, fp32x2)
+CK_TILE_SCALED_TYPE_CONVERT(fp32x2_t, fp32x2, pk_fp4_t, pk_fp4)
+CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16x2_t, fp16x2)
+CK_TILE_SCALED_TYPE_CONVERT(fp16x2_t, fp16x2, pk_fp4_t, pk_fp4)
+CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16x2_t, bf16x2)
+CK_TILE_SCALED_TYPE_CONVERT(bf16x2_t, bf16x2, pk_fp4_t, pk_fp4)
+CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, float, float)
+CK_TILE_SCALED_TYPE_CONVERT(float, float, pk_fp4_t, pk_fp4)
+CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, bf16_t, bf16)
+CK_TILE_SCALED_TYPE_CONVERT(bf16_t, bf16, pk_fp4_t, pk_fp4)
+CK_TILE_SCALED_TYPE_CONVERT(pk_fp4_t, pk_fp4, fp16_t, fp16)
+CK_TILE_SCALED_TYPE_CONVERT(fp16_t, fp16, pk_fp4_t, pk_fp4)
+#undef CK_TILE_SCALED_TYPE_CONVERT
+
 #endif
 
 } // namespace ck_tile
diff --git a/include/ck_tile/core/numeric/vector_type.hpp b/include/ck_tile/core/numeric/vector_type.hpp
index b165275a8c..58bdb43b08 100644
--- a/include/ck_tile/core/numeric/vector_type.hpp
+++ b/include/ck_tile/core/numeric/vector_type.hpp
@@ -84,7 +84,7 @@ using ext_vector_t = typename impl::ext_vector<T, N>::type;
 
 // by default, any type will result in a vector_size=1 with scalar_type=T traits.
 // ... unless we have other vector_traits specialization
-template <typename T, typename>
+template <typename T, typename = void>
 struct vector_traits
 {
     using scalar_type =
@@ -94,7 +94,7 @@ struct vector_traits
 
 // specialization for ext_vector_type()
 template <typename T, index_t N>
-struct vector_traits<T __attribute__((ext_vector_type(N)))>
+struct vector_traits<T __attribute__((ext_vector_type(N))), void>
 {
     using scalar_type = std::conditional_t<std::is_same_v<T, pk_int4_t>, int8_t, T>;
     static constexpr index_t vector_size = N;
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index 4b39773939..ca314a6abe 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -210,28 +210,6 @@ struct buffer_view<address_space_enum::generic,
 
     // FIXME: remove
     CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("buffer_view{");
-
-        // AddressSpace
-        printf("AddressSpace: generic, ");
-
-        // p_data_
-        printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
-
-        // buffer_size_
-        printf("buffer_size_: ");
-        print(buffer_size_);
-        printf(", ");
-
-        // invalid_element_value_
-        printf("invalid_element_value_: ");
-        print(invalid_element_value_);
-
-        printf("}");
-    }
 };
 
 // Address Space: Global
@@ -757,28 +735,6 @@ struct buffer_view<address_space_enum::global,
 
     // FIXME: remove
     CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("buffer_view{");
-
-        // AddressSpace
-        printf("AddressSpace: Global, ");
-
-        // p_data_
-        printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
-
-        // buffer_size_
-        printf("buffer_size_: ");
-        print(buffer_size_);
-        printf(", ");
-
-        // invalid_element_value_
-        printf("invalid_element_value_: ");
-        print(invalid_element_value_);
-
-        printf("}");
-    }
 };
 
 // Address Space: LDS
@@ -1138,28 +1094,6 @@ struct buffer_view<address_space_enum::lds,
 
     // FIXME: remove
     CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("buffer_view{");
-
-        // AddressSpace
-        printf("AddressSpace: Lds, ");
-
-        // p_data_
-        printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
-
-        // buffer_size_
-        printf("buffer_size_: ");
-        print(buffer_size_);
-        printf(", ");
-
-        // invalid_element_value_
-        printf("invalid_element_value_: ");
-        print(invalid_element_value_);
-
-        printf("}");
-    }
 };
 
 // Address Space: Vgpr
@@ -1313,28 +1247,6 @@ struct buffer_view<address_space_enum::vgpr,
 
     // FIXME: remove
     CK_TILE_DEVICE static constexpr bool is_dynamic_buffer() { return true; }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("buffer_view{");
-
-        // AddressSpace
-        printf("AddressSpace: Vgpr, ");
-
-        // p_data_
-        printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
-
-        // buffer_size_
-        printf("buffer_size_: ");
-        print(buffer_size_);
-        printf(", ");
-
-        // invalid_element_value_
-        printf("invalid_element_value_: ");
-        print(invalid_element_value_);
-
-        printf("}");
-    }
 };
 
 template <address_space_enum BufferAddressSpace,
@@ -1360,4 +1272,25 @@ make_buffer_view(T* p, BufferSizeType buffer_size, X invalid_element_value)
         p, buffer_size, invalid_element_value};
 }
 
+// Generalized print function for all buffer_view variants
+template <address_space_enum BufferAddressSpace,
+          typename T,
+          typename BufferSizeType,
+          bool InvalidElementUseNumericalZeroValue,
+          amd_buffer_coherence_enum Coherence>
+CK_TILE_HOST_DEVICE void print(const buffer_view<BufferAddressSpace,
+                                                 T,
+                                                 BufferSizeType,
+                                                 InvalidElementUseNumericalZeroValue,
+                                                 Coherence>& bv)
+{
+    printf("buffer_view{AddressSpace: %s, p_data_: %p, buffer_size_: ",
+           address_space_to_string(BufferAddressSpace),
+           static_cast<void*>(const_cast<remove_cvref_t<T>*>(bv.p_data_)));
+    print(bv.buffer_size_);
+    printf(", invalid_element_value_: ");
+    print(bv.invalid_element_value_);
+    printf("}");
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/null_tile_window.hpp b/include/ck_tile/core/tensor/null_tile_window.hpp
index de99be1965..f7eca73afb 100644
--- a/include/ck_tile/core/tensor/null_tile_window.hpp
+++ b/include/ck_tile/core/tensor/null_tile_window.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -53,10 +53,13 @@ struct is_null_tile_window<null_tile_window<T>> : public std::true_type
 };
 } // namespace impl
 
+template <typename T>
+constexpr bool is_null_tile_window_v = impl::is_null_tile_window<remove_cvref_t<T>>::value;
+
 template <typename T>
 CK_TILE_DEVICE constexpr auto is_null_tile_window(const T&)
 {
-    return impl::is_null_tile_window<remove_cvref_t<T>>::value;
+    return is_null_tile_window_v<remove_cvref_t<T>>;
 }
 
 template <typename WindowLengths>
diff --git a/include/ck_tile/core/tensor/tensor_adaptor.hpp b/include/ck_tile/core/tensor/tensor_adaptor.hpp
index e2a6ae6555..ec5538d79c 100644
--- a/include/ck_tile/core/tensor/tensor_adaptor.hpp
+++ b/include/ck_tile/core/tensor/tensor_adaptor.hpp
@@ -305,42 +305,45 @@ struct tensor_adaptor
                           get_container_subset(vector_strides, top_dims));
     }
 
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("tensor_adaptor{");
-
-        //
-        printf("transforms: ");
-        print(transforms_);
-        printf(", ");
-
-        //
-        printf("LowerDimensionHiddenIds: ");
-        print(LowerDimensionHiddenIdss{});
-        printf(", ");
-
-        //
-        printf("UpperDimensionHiddenIds: ");
-        print(UpperDimensionHiddenIdss{});
-        printf(", ");
-
-        //
-        printf("BottomDimensionHiddenIds: ");
-        print(BottomDimensionHiddenIds{});
-        printf(", ");
-
-        //
-        printf("TopDimensionHiddenIds: ");
-        print(TopDimensionHiddenIds{});
-
-        printf("}");
-    }
-
     private:
     Transforms transforms_;
     ElementSize element_size_;
 };
 
+template <typename Transforms,
+          typename LowerDimensionHiddenIdss,
+          typename UpperDimensionHiddenIdss,
+          typename BottomDimensionHiddenIds,
+          typename TopDimensionHiddenIds>
+CK_TILE_HOST_DEVICE static void print(const tensor_adaptor<Transforms,
+                                                           LowerDimensionHiddenIdss,
+                                                           UpperDimensionHiddenIdss,
+                                                           BottomDimensionHiddenIds,
+                                                           TopDimensionHiddenIds>& adaptor)
+{
+    printf("tensor_adaptor{\n");
+    printf("    transforms: [");
+    print(adaptor.get_transforms());
+    printf("],\n");
+
+    printf("    LowerDimensionHiddenIds: [");
+    print(LowerDimensionHiddenIdss{});
+    printf("],\n");
+
+    printf("    UpperDimensionHiddenIds: [");
+    print(UpperDimensionHiddenIdss{});
+    printf("],\n");
+
+    printf("    BottomDimensionHiddenIds: [");
+    print(BottomDimensionHiddenIds{});
+    printf("],\n");
+
+    //
+    printf("    TopDimensionHiddenIds: [");
+    print(TopDimensionHiddenIds{});
+    printf("]\n}\n");
+}
+
 // Transforms: Tuple<transforms...>
 // LowerDimensionOldTopIdss: Tuple<Sequence<...>, ...>
 // UpperDimensionNewTopIdss: Tuple<Sequence<...>, ...>
diff --git a/include/ck_tile/core/tensor/tensor_descriptor.hpp b/include/ck_tile/core/tensor/tensor_descriptor.hpp
index 0c3e04f315..0e4787a2f1 100644
--- a/include/ck_tile/core/tensor/tensor_descriptor.hpp
+++ b/include/ck_tile/core/tensor/tensor_descriptor.hpp
@@ -140,25 +140,37 @@ struct tensor_descriptor : public tensor_adaptor<Transforms,
             to_array<index_t, ndim_hidden_>(GuaranteedVectorStrides{}));
     }
 
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("tensor_descriptor{");
-
-        // tensor_adaptor
-        Base::print();
-        printf(", ");
-
-        // element_space_size_
-        printf("element_space_size_: ");
-        print(element_space_size_);
-
-        printf("}");
-    }
-
     // TODO make these private
     ElementSpaceSize element_space_size_;
 };
 
+template <typename Transforms,
+          typename LowerDimensionHiddenIdss,
+          typename UpperDimensionHiddenIdss,
+          typename TopDimensionHiddenIds,
+          typename ElementSpaceSize,
+          typename GuaranteedVectorLengths,
+          typename GuaranteedVectorStrides>
+CK_TILE_HOST_DEVICE static void print(const tensor_descriptor<Transforms,
+                                                              LowerDimensionHiddenIdss,
+                                                              UpperDimensionHiddenIdss,
+                                                              TopDimensionHiddenIds,
+                                                              ElementSpaceSize,
+                                                              GuaranteedVectorLengths,
+                                                              GuaranteedVectorStrides>& descriptor)
+{
+    printf("tensor_descriptor{\n");
+    // first print the tensor adaptor part of the descriptor using the base class print
+    print(static_cast<const typename decltype(descriptor)::Base&>(descriptor));
+    printf("element_space_size_: %ld,\n",
+           static_cast<long>(descriptor.get_element_space_size().value));
+    printf("guaranteed_vector_lengths: ");
+    print(GuaranteedVectorLengths{});
+    printf(",\nguaranteed_vector_strides: ");
+    print(GuaranteedVectorStrides{});
+    printf("}\n}\n");
+}
+
 template <typename Adaptor, typename ElementSpaceSize>
 CK_TILE_HOST_DEVICE constexpr auto
 make_tensor_descriptor_from_adaptor(const Adaptor& adaptor,
diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp
index 11e6b35c39..bc02ec74d2 100644
--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -228,24 +228,6 @@ struct tile_distribution
     {
         return PsYs2XsAdaptor::is_static() && Ys2DDescriptor::is_static();
     }
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("tile_distribution{");
-        //
-        printf("tile_distribution_encoding: ");
-        print(DstrEncode{});
-        printf(", ");
-        //
-        printf("ps_ys_to_xs_: ");
-        print(ps_ys_to_xs_);
-        printf(", ");
-        //
-        printf("ys_to_d_: ");
-        print(ys_to_d_);
-        //
-        printf("}");
-    }
 };
 
 namespace detail {
@@ -710,4 +692,27 @@ CK_TILE_HOST_DEVICE constexpr auto slice_distribution_from_x(
 }
 
 } // namespace detail
+
+// Free print function for tile_distribution
+template <typename PsYs2XsAdaptor_,
+          typename Ys2DDescriptor_,
+          typename StaticTileDistributionEncoding_,
+          typename TileDistributionDetail_>
+CK_TILE_HOST_DEVICE void print(const tile_distribution<PsYs2XsAdaptor_,
+                                                       Ys2DDescriptor_,
+                                                       StaticTileDistributionEncoding_,
+                                                       TileDistributionDetail_>& distribution)
+{
+    printf("tile_distribution{");
+    printf("tile_distribution_encoding: ");
+    print(StaticTileDistributionEncoding_{});
+    printf(", ");
+    printf("ps_ys_to_xs_: ");
+    print(distribution.ps_ys_to_xs_);
+    printf(", ");
+    printf("ys_to_d_: ");
+    print(distribution.ys_to_d_);
+    printf("}\n");
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/tile_distribution_encoding.hpp b/include/ck_tile/core/tensor/tile_distribution_encoding.hpp
index b380e7c9d8..90d1a2ccb2 100644
--- a/include/ck_tile/core/tensor/tile_distribution_encoding.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution_encoding.hpp
@@ -428,109 +428,7 @@ struct tile_distribution_encoding
         {
             return get_sorted_info(get_uniformed_idx_y_to_h(), get_h_dim_lengths_prefix_sum());
         }
-
-        CK_TILE_HOST_DEVICE void print() const
-        {
-            printf("tile_distribution_encoding::detail{");
-            //
-            printf("ndim_rh_major_: ");
-            print(ndim_rh_major_);
-            printf(", ");
-            //
-            printf("ndim_span_major_: ");
-            print(ndim_span_major_);
-            printf(", ");
-            //
-            printf("ndims_rhs_minor_: ");
-            print(ndims_rhs_minor_);
-            printf(", ");
-            //
-            printf("ndim_rh_major_: ");
-            print(ndim_rh_major_);
-            printf(", ");
-            //
-            printf("max_ndim_rh_minor_: ");
-            print(max_ndim_rh_minor_);
-            printf(", ");
-            //
-            printf("rhs_lengthss_: ");
-            print(rhs_lengthss_);
-            printf(", ");
-            //
-            printf("ys_lengths_: ");
-            print(ys_lengths_);
-            printf(", ");
-            //
-            printf("rhs_major_minor_to_ys_: ");
-            print(rhs_major_minor_to_ys_);
-            printf(", ");
-            //
-            printf("ndims_span_minor_: ");
-            print(ndims_span_minor_);
-            printf(", ");
-            //
-            printf("max_ndim_span_minor_: ");
-            print(max_ndim_span_minor_);
-            printf(", ");
-            //
-            printf("ys_to_span_major_: ");
-            print(ys_to_span_major_);
-            printf(", ");
-            //
-            printf("ys_to_span_minor_: ");
-            print(ys_to_span_minor_);
-            printf(", ");
-            //
-            printf("distributed_spans_lengthss_: ");
-            print(distributed_spans_lengthss_);
-            printf(", ");
-            //
-            printf("ndims_distributed_spans_minor_: ");
-            print(ndims_distributed_spans_minor_);
-            printf(", ");
-            //
-            printf("ps_over_rs_derivative_: ");
-            print(ps_over_rs_derivative_);
-            //
-            printf("}");
-        }
     };
-
-    CK_TILE_HOST_DEVICE void print() const
-    {
-        printf("tile_distribution_encoding{");
-        //
-        printf("NDimX: %d, NDimP: %d, NDimY: %d, ", NDimX, NDimP, NDimY);
-        //
-        printf("rs_lengths_: ");
-        print(rs_lengths_);
-        printf(", ");
-        //
-        printf("hs_lengthss_: ");
-        print(hs_lengthss_);
-        printf(", ");
-        //
-        printf("ps_to_rhss_major_: ");
-        print(ps_to_rhss_major_);
-        printf(", ");
-        //
-        printf("ps_to_rhss_minor_: ");
-        print(ps_to_rhss_minor_);
-        printf(", ");
-        //
-        printf("ys_to_rhs_major_: ");
-        print(ys_to_rhs_major_);
-        printf(", ");
-        //
-        printf("ys_to_rhs_minor_: ");
-        print(ys_to_rhs_minor_);
-        printf(", ");
-        //
-        printf("detail: ");
-        print(detail{});
-        //
-        printf("}");
-    }
 };
 
 template <typename encoding, typename shuffle>
@@ -896,4 +794,106 @@ make_reduce_tile_distribution_encoding(InDstr, sequence<InReduceDimXs...> reduce
 }
 
 } // namespace detail
+
+// Free print function for tile_distribution_encoding::detail
+template <typename RsLengths_,
+          typename HsLengthss_,
+          typename Ps2RHssMajor_,
+          typename Ps2RHssMinor_,
+          typename Ys2RHsMajor_,
+          typename Ys2RHsMinor_>
+CK_TILE_HOST_DEVICE void
+print(const typename tile_distribution_encoding<RsLengths_,
+                                                HsLengthss_,
+                                                Ps2RHssMajor_,
+                                                Ps2RHssMinor_,
+                                                Ys2RHsMajor_,
+                                                Ys2RHsMinor_>::detail& detail_obj)
+{
+    printf("tile_distribution_encoding::detail{");
+    printf("ndim_rh_major_: ");
+    print(detail_obj.ndim_rh_major_);
+    printf(", ");
+    printf("ndim_span_major_: ");
+    print(detail_obj.ndim_span_major_);
+    printf(", ");
+    printf("ndims_rhs_minor_: ");
+    print(detail_obj.ndims_rhs_minor_);
+    printf(", ");
+    printf("ndim_rh_major_: ");
+    print(detail_obj.ndim_rh_major_);
+    printf(", ");
+    printf("max_ndim_rh_minor_: ");
+    print(detail_obj.max_ndim_rh_minor_);
+    printf(", ");
+    printf("rhs_lengthss_: ");
+    print(detail_obj.rhs_lengthss_);
+    printf(", ");
+    printf("ys_lengths_: ");
+    print(detail_obj.ys_lengths_);
+    printf(", ");
+    printf("rhs_major_minor_to_ys_: ");
+    print(detail_obj.rhs_major_minor_to_ys_);
+    printf(", ");
+    printf("ndims_span_minor_: ");
+    print(detail_obj.ndims_span_minor_);
+    printf(", ");
+    printf("max_ndim_span_minor_: ");
+    print(detail_obj.max_ndim_span_minor_);
+    printf(", ");
+    printf("ys_to_span_major_: ");
+    print(detail_obj.ys_to_span_major_);
+    printf(", ");
+    printf("ys_to_span_minor_: ");
+    print(detail_obj.ys_to_span_minor_);
+    printf(", ");
+    printf("distributed_spans_lengthss_: ");
+    print(detail_obj.distributed_spans_lengthss_);
+    printf(", ");
+    printf("ndims_distributed_spans_minor_: ");
+    print(detail_obj.ndims_distributed_spans_minor_);
+    printf(", ");
+    printf("ps_over_rs_derivative_: ");
+    print(detail_obj.ps_over_rs_derivative_);
+    printf("}");
+}
+
+// Free print function for tile_distribution_encoding
+template <typename RsLengths_,
+          typename HsLengthss_,
+          typename Ps2RHssMajor_,
+          typename Ps2RHssMinor_,
+          typename Ys2RHsMajor_,
+          typename Ys2RHsMinor_>
+CK_TILE_HOST_DEVICE void print(const tile_distribution_encoding<RsLengths_,
+                                                                HsLengthss_,
+                                                                Ps2RHssMajor_,
+                                                                Ps2RHssMinor_,
+                                                                Ys2RHsMajor_,
+                                                                Ys2RHsMinor_>& encoding)
+{
+    printf("tile_distribution_encoding{");
+
+    printf("NDimX: %d, NDimP: %d, NDimY: %d, ", encoding.NDimX, encoding.NDimP, encoding.NDimY);
+    printf("rs_lengths_: ");
+    print(encoding.rs_lengths_);
+    printf(", ");
+    printf("hs_lengthss_: ");
+    print(encoding.hs_lengthss_);
+    printf(", ");
+    printf("ps_to_rhss_major_: ");
+    print(encoding.ps_to_rhss_major_);
+    printf(", ");
+    printf("ps_to_rhss_minor_: ");
+    print(encoding.ps_to_rhss_minor_);
+    printf(", ");
+    printf("ys_to_rhs_major_: ");
+    print(encoding.ys_to_rhs_major_);
+    printf(", ");
+    printf("ys_to_rhs_minor_: ");
+    print(encoding.ys_to_rhs_minor_);
+    printf(", ");
+    printf("}");
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/utility/print.hpp b/include/ck_tile/core/utility/print.hpp
new file mode 100644
index 0000000000..04635959af
--- /dev/null
+++ b/include/ck_tile/core/utility/print.hpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+
+namespace ck_tile {
+
+/// Declare a ck_tile::print() interface that gets specialized in each header file for types that
+/// can be printed.
+template <typename T>
+CK_TILE_HOST_DEVICE void print(const T&)
+{
+    static_assert(sizeof(T) == 0,
+                  "No print implementation available for this type. Please specialize "
+                  "ck_tile::print for your type.");
+}
+
+/// Specialization for int
+template <>
+CK_TILE_HOST_DEVICE void print(const int& value)
+{
+    printf("%d", value);
+}
+
+/// Specialization for float
+template <>
+CK_TILE_HOST_DEVICE void print(const float& value)
+{
+    printf("%f", value);
+}
+
+/// Specialization for double
+template <>
+CK_TILE_HOST_DEVICE void print(const double& value)
+{
+    printf("%f", value);
+}
+
+/// Specialization for long
+template <>
+CK_TILE_HOST_DEVICE void print(const long& value)
+{
+    printf("%ld", value);
+}
+
+/// Specialization for unsigned int
+template <>
+CK_TILE_HOST_DEVICE void print(const unsigned int& value)
+{
+    printf("%u", value);
+}
+
+/// Specialization for char
+template <>
+CK_TILE_HOST_DEVICE void print(const char& value)
+{
+    printf("%c", value);
+}
+
+/// Specialization for array
+template <typename T, size_t N>
+CK_TILE_HOST_DEVICE void print(const T (&value)[N])
+{
+    printf("[");
+    for(size_t i = 0; i < N; ++i)
+    {
+        if(i > 0)
+            printf(", ");
+        print(value[i]); // Recursively call print for each element
+    }
+    printf("]");
+}
+
+} // namespace ck_tile
diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp
index 8b15d187fe..2d7ac78b06 100644
--- a/include/ck_tile/core/utility/reduce_operator.hpp
+++ b/include/ck_tile/core/utility/reduce_operator.hpp
@@ -26,7 +26,8 @@ struct Add
     }
 
     template <typename T,
-              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t>>>
+              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
     CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const
     {
         float y_ = type_convert<float>(y);
@@ -34,6 +35,8 @@ struct Add
 
         return type_convert<T>(y_ + x_);
     }
+
+    static constexpr bool requires_special_combine = false;
 };
 
 struct SquareAdd
@@ -51,13 +54,47 @@ struct SquareAdd
     {
         return y + (x * x);
     }
+
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const
+    {
+        float y_ = type_convert<float>(y);
+        float x_ = type_convert<float>(x);
+        return type_convert<T>(y_ + (x_ * x_));
+    }
+
+    // For combining partial results
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T combine_partial_results(const T& partial1,
+                                                            const T& partial2) const
+    {
+        return partial1 + partial2; // Just add the partial sums, don't square again
+    }
+
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T combine_partial_results(T& partial1, T& partial2) const
+    {
+        float partial1_ = type_convert<float>(partial1);
+        float partial2_ = type_convert<float>(partial2);
+        return type_convert<T>(partial1_ + partial2_);
+    }
+
+    static constexpr bool requires_special_combine = true;
 };
 
 struct Max
 {
     template <typename T,
               typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
     CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
     {
         return numeric<T>::min();
@@ -65,18 +102,24 @@ struct Max
 
     template <typename T,
               typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
     CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
     {
         return max(y, x);
     }
+
+    static constexpr bool requires_special_combine = false;
 };
 
 struct AbsMax
 {
     template <typename T,
               typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
     CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
     {
         return numeric<T>::min();
@@ -84,11 +127,15 @@ struct AbsMax
 
     template <typename T,
               typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
     CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
     {
         return max(y, abs(x));
     }
+
+    static constexpr bool requires_special_combine = false;
 };
 
 } // namespace ReduceOp
diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp
index c3f1b7d221..b7329fcac7 100644
--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -409,7 +409,13 @@ struct HostTensor
     }
 
     // void SetZero() { ck_tile::ranges::fill<T>(mData, 0); }
-    void SetZero() { std::fill(mData.begin(), mData.end(), 0); }
+    void SetZero()
+    {
+        if constexpr(std::is_same_v<T, e8m0_t>)
+            std::fill(mData.begin(), mData.end(), e8m0_t{1.f});
+        else
+            std::fill(mData.begin(), mData.end(), 0);
+    }
 
     template <typename F>
     void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
diff --git a/include/ck_tile/host/reference/reference_reduce.hpp b/include/ck_tile/host/reference/reference_reduce.hpp
index 8f8aa23670..9952b7b009 100644
--- a/include/ck_tile/host/reference/reference_reduce.hpp
+++ b/include/ck_tile/host/reference/reference_reduce.hpp
@@ -30,4 +30,82 @@ reference_reduce(const HostTensor<XDataType>& x_m_n, HostTensor<YDataType>& y_m,
 
     make_ParallelTensorFunctor(f, y_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
 }
+
+// Generic reference reduce for arbitrary dimensions
+template <
+    typename XDataType,
+    typename ComputeDataType,
+    typename YDataType,
+    typename ReduceOp,
+    typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to keep
+    typename ReduceDims> // Expected type: ck_tile::sequence<...> containing dimension indices to
+                         // reduce
+CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
+                                   HostTensor<YDataType>& y_tensor,
+                                   ReduceOp reduce_op,
+                                   KeptDim kept_dim,
+                                   ReduceDims reduce_dims)
+{
+    const auto& x_lengths = x_tensor.mDesc.get_lengths();
+
+    // Calculate total kept elements (product of all kept dimension lengths)
+    index_t total_kept_elements = 1;
+    static_for<0, kept_dim.size(), 1>{}(
+        [&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
+
+    // Calculate total reduce elements (product of all reduce dimension lengths)
+    index_t total_reduce_elements = 1;
+    static_for<0, reduce_dims.size(), 1>{}(
+        [&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
+
+    auto f = [&](auto linear_kept_idx) {
+        ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();
+
+        // Convert linear kept index to multi-dimensional kept indices
+        std::vector<index_t> kept_indices(kept_dim.size());
+        index_t temp_kept = linear_kept_idx;
+        static_for<0, kept_dim.size(), 1>{}([&](auto i) {
+            constexpr auto dim_idx = kept_dim.size() - 1 - i;
+            constexpr auto dim     = kept_dim.at(dim_idx);
+            const auto len         = x_lengths[dim];
+            kept_indices[dim_idx]  = temp_kept % len;
+            temp_kept /= len;
+        });
+
+        for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
+        {
+            // Convert linear reduce index to multi-dimensional reduce indices
+            std::vector<index_t> reduce_indices(reduce_dims.size());
+            index_t temp_reduce = reduce_idx;
+            static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
+                constexpr auto dim_idx  = reduce_dims.size() - 1 - i;
+                constexpr auto dim      = reduce_dims.at(dim_idx);
+                const auto len          = x_lengths[dim];
+                reduce_indices[dim_idx] = temp_reduce % len;
+                temp_reduce /= len;
+            });
+
+            // Build full input tensor indices by combining kept and reduce indices
+            std::vector<std::size_t> full_indices(x_lengths.size(), 0);
+            static_for<0, kept_dim.size(), 1>{}(
+                [&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
+            static_for<0, reduce_dims.size(), 1>{}(
+                [&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
+
+            // Access input tensor element
+            const auto v_a = type_convert<ComputeDataType>(x_tensor(full_indices));
+
+            v_acc = reduce_op(v_acc, v_a);
+        }
+
+        // Calculate output tensor index using kept indices
+        // The output tensor has the same structure as the kept dimensions
+        std::vector<std::size_t> y_indices(kept_dim.size());
+        static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
+
+        y_tensor(y_indices) = type_convert<YDataType>(v_acc);
+    };
+
+    make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
+}
 } // namespace ck_tile
diff --git a/include/ck_tile/host/reference/reference_softmax.hpp b/include/ck_tile/host/reference/reference_softmax.hpp
index d86e879944..4e729c437d 100644
--- a/include/ck_tile/host/reference/reference_softmax.hpp
+++ b/include/ck_tile/host/reference/reference_softmax.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -14,7 +14,7 @@ CK_TILE_HOST void
 reference_softmax(const HostTensor<InputType>& x, HostTensor<OutputType>& y, index_t dim = -1)
 {
     index_t rank = x.get_num_of_dimension();
-    assert(rank == y.get_num_of_dimension());
+    assert(static_cast<std::size_t>(rank) == y.get_num_of_dimension());
     assert(dim == -1 || dim < rank);
 
     index_t target_dim  = dim == -1 ? (rank - 1) : dim;
diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp
index 3d0404a2e5..0fc99a983a 100644
--- a/include/ck_tile/host/reference/reference_topk.hpp
+++ b/include/ck_tile/host/reference/reference_topk.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -38,8 +38,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
 {
     // rank must be the same
     index_t rank = x.get_num_of_dimension();
-    assert(rank == y_values.get_num_of_dimension());
-    assert(rank == y_indices.get_num_of_dimension());
+    assert(static_cast<std::size_t>(rank) == y_values.get_num_of_dimension());
+    assert(static_cast<size_t>(rank) == y_indices.get_num_of_dimension());
     assert(dim == -1 || dim < rank);
 
     index_t topk_dim     = dim == -1 ? (rank - 1) : dim;
@@ -47,7 +47,8 @@ CK_TILE_HOST void reference_topk(const HostTensor<DataType>& x,
     auto x_len           = x.get_lengths();
 
     assert(k <= topk_src_len);
-    assert(k == y_values.get_length(topk_dim) && k == y_indices.get_length(topk_dim));
+    assert(static_cast<size_t>(k) == y_values.get_length(topk_dim) &&
+           static_cast<size_t>(k) == y_indices.get_length(topk_dim));
 
     index_t n_parallel = x.get_element_size() / topk_src_len;
 
diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
index a89a190489..a4150e8d84 100644
--- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
+++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
@@ -49,9 +49,11 @@ struct BatchedTransposeKernel
 
     CK_TILE_HOST static constexpr auto GridSize(const Hargs& host_args)
     {
-        size_t grid_size_x = (host_args.height + host_args.dim_block_h - 1) / host_args.dim_block_h;
-        size_t grid_size_y = (host_args.width + host_args.dim_block_w - 1) / host_args.dim_block_w;
-        size_t grid_size_z = host_args.batch;
+        const size_t grid_size_x =
+            ck_tile::integer_divide_ceil(host_args.height, host_args.dim_block_h);
+        const size_t grid_size_y =
+            ck_tile::integer_divide_ceil(host_args.width, host_args.dim_block_w);
+        const size_t grid_size_z = host_args.batch;
         return dim3(grid_size_x, grid_size_y, grid_size_z);
     }
 
@@ -71,41 +73,43 @@ struct BatchedTransposeKernel
 
     CK_TILE_DEVICE void operator()(Kargs kargs) const
     {
-        static constexpr ck_tile::index_t kMPerBlock       = Problem::kMPerBlock;
-        static constexpr ck_tile::index_t kNPerBlock       = Problem::kNPerBlock;
-        static constexpr bool kPadM                        = Problem::kPadM;
-        static constexpr bool kPadN                        = Problem::kPadN;
-        static constexpr ck_tile::index_t VectorSizeInput  = Problem::VectorSizeInput;
-        static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput;
+        static constexpr ck_tile::index_t kMPerBlock         = Problem::kMPerBlock;
+        static constexpr ck_tile::index_t kNPerBlock         = Problem::kNPerBlock;
+        static constexpr bool kPadM                          = Problem::kPadM;
+        static constexpr bool kPadN                          = Problem::kPadN;
+        static constexpr ck_tile::index_t VectorSizeInput    = Problem::VectorSizeInput;
+        static constexpr ck_tile::index_t VectorStrideInput  = 1;
+        static constexpr ck_tile::index_t VectorSizeOutput   = Problem::VectorSizeOutput;
+        static constexpr ck_tile::index_t VectorStrideOutput = 1;
 
-        const auto iM   = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
-        const auto iN   = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock);
-        const auto iDim = blockIdx.z;
+        const auto iM     = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
+        const auto iN     = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock);
+        const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width);
 
         const auto x_m_n = [&]() {
             const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                static_cast<const Type*>(kargs.p_input) + iDim * kargs.dim_stride,
+                static_cast<const Type*>(kargs.p_input) + offset,
                 make_tuple(kargs.height, kargs.width),
                 make_tuple(kargs.width, 1),
                 number<VectorSizeInput>{},
-                number<1>{});
+                number<VectorStrideInput>{});
 
             return pad_tensor_view(x_dram_naive,
                                    make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
-                                   sequence<kPadN, kPadM>{});
+                                   sequence<kPadM, kPadN>{});
         }();
 
         const auto y_n_m = [&]() {
             const auto y_dram_naive = make_naive_tensor_view<address_space_enum::global>(
-                static_cast<Type*>(kargs.p_output) + iDim * kargs.dim_stride,
+                static_cast<Type*>(kargs.p_output) + offset,
                 make_tuple(kargs.width, kargs.height),
                 make_tuple(kargs.height, 1),
                 number<VectorSizeOutput>{},
-                number<1>{});
+                number<VectorStrideOutput>{});
 
             return pad_tensor_view(y_dram_naive,
                                    make_tuple(number<kNPerBlock>{}, number<kMPerBlock>{}),
-                                   sequence<kPadM, kPadN>{});
+                                   sequence<kPadN, kPadM>{});
         }();
 
         auto x_block_window = make_tile_window(
diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp
index e344c24bf5..3b8d5a142e 100644
--- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp
+++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_common_policy.hpp
@@ -15,15 +15,15 @@ struct BatchedTransposeCommonPolicy
     template <typename Problem>
     CK_TILE_DEVICE static constexpr auto MakeInputDistribution()
     {
-        constexpr index_t BlockSize         = Problem::kBlockSize;
-        constexpr index_t LeadDimPerBlock   = Problem::kMPerBlock;
-        constexpr index_t SecondDimPerBlock = Problem::kNPerBlock;
+        constexpr index_t kBlockSize         = Problem::kBlockSize;
+        constexpr index_t kLeadDimPerBlock   = Problem::kNPerBlock;
+        constexpr index_t kSecondDimPerBlock = Problem::kMPerBlock;
 
-        constexpr index_t kVectorSize = Problem::VectorSizeOutput;
-
-        using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                      SecondDimPerBlock,
-                                                                      LeadDimPerBlock,
+        constexpr index_t kVectorSize = Problem::VectorSizeInput;
+        static_assert((kLeadDimPerBlock * kVectorSize) % kBlockSize == 0, "");
+        using TileEncodingPattern = TileDistributionEncodingPattern2D<kBlockSize,
+                                                                      kSecondDimPerBlock,
+                                                                      kLeadDimPerBlock,
                                                                       kVectorSize,
                                                                       TileAccessPattern>;
         return TileEncodingPattern::Make2DStaticTileDistribution();
diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp
index 491db37564..45803ae2da 100644
--- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp
+++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_lds_problem.hpp
@@ -18,19 +18,19 @@ struct BatchedTransposeLdsProblem
 {
     using DataType = remove_cvref_t<DataType_>;
 
-    static constexpr index_t kRowWarps_    = NumWarps::at(number<1>{});
-    static constexpr index_t kColWarps_    = NumWarps::at(number<0>{});
+    static constexpr index_t kRowWarps_    = NumWarps::at(number<0>{});
+    static constexpr index_t kColWarps_    = NumWarps::at(number<1>{});
     static constexpr index_t kBlockSize_   = get_warp_size() * kRowWarps_ * kColWarps_;
-    static constexpr index_t kRowPerBlock_ = BlockTile::at(number<1>{});
-    static constexpr index_t kColPerBlock_ = BlockTile::at(number<0>{});
+    static constexpr index_t kRowPerBlock_ = BlockTile::at(number<0>{});
+    static constexpr index_t kColPerBlock_ = BlockTile::at(number<1>{});
 
     static constexpr index_t kBlockSize = kBlockSize_;
     // warps per block
-    static constexpr index_t kLeadNumWarps   = kRowWarps_;
-    static constexpr index_t kSecondNumWarps = kColWarps_;
+    static constexpr index_t kLeadNumWarps   = kColWarps_;
+    static constexpr index_t kSecondNumWarps = kRowWarps_;
 
-    static constexpr index_t kLeadSizePerBlock   = kRowPerBlock_;
-    static constexpr index_t kSecondSizePerBlock = kColPerBlock_;
+    static constexpr index_t kLeadSizePerBlock   = kColPerBlock_;
+    static constexpr index_t kSecondSizePerBlock = kRowPerBlock_;
 
     static constexpr index_t kQuadrantLeadDim   = LaneGroupTransposeTraits<DataType>::kleadDim;
     static constexpr index_t kQuadrantSecondDim = LaneGroupTransposeTraits<DataType>::ksecondDim;
@@ -60,8 +60,8 @@ struct BatchedTransposeLdsProblem
     static constexpr bool kPadM = kPadM_;
     static constexpr bool kPadN = kPadN_;
 
-    static constexpr auto kMPerBlock = kLeadSizePerBlock;
-    static constexpr auto kNPerBlock = kSecondSizePerBlock;
+    static constexpr auto kMPerBlock = kSecondSizePerBlock;
+    static constexpr auto kNPerBlock = kLeadSizePerBlock;
 
     // 128-bit is the max single-instruction bandwidth for load/store
     static constexpr index_t MaxLoadStoreSize = 16;
diff --git a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
index 5238fecdc5..e6bbc709ea 100644
--- a/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
+++ b/include/ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp
@@ -19,8 +19,8 @@ struct BatchedTransposePolicy : public BatchedTransposeCommonPolicy
         constexpr index_t VecLoadSize = Problem::VectorSizeOutput;
 
         using TileEncodingPattern = TileDistributionEncodingPattern2D<BlockSize,
-                                                                      NPerBlock,
                                                                       MPerBlock,
+                                                                      NPerBlock,
                                                                       VecLoadSize,
                                                                       TileAccessPattern>;
         return TileEncodingPattern::MakeShuffled2DStaticTileDistribution();
diff --git a/include/ck_tile/ops/fmha.hpp b/include/ck_tile/ops/fmha.hpp
index 30bea193b7..313de5f29a 100644
--- a/include/ck_tile/ops/fmha.hpp
+++ b/include/ck_tile/ops/fmha.hpp
@@ -24,8 +24,8 @@
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp"
-#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline.hpp"
 #include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline_default_policy.hpp"
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
index ce3bf8fe8d..8b184b18f3 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
@@ -52,8 +52,6 @@ struct FmhaBwdDQDKDVKernel
     using BiasGradDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::BiasGradDataType>;
 
     static constexpr bool kIsGroupMode = FmhaPipeline::kIsGroupMode;
-    static constexpr bool kPadSeqLenQ  = FmhaPipeline::kPadSeqLenQ;
-    static constexpr bool kPadSeqLenK  = FmhaPipeline::kPadSeqLenK;
     static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
     static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
     static constexpr auto BiasEnum     = FmhaPipeline::BiasEnum;
@@ -85,8 +83,6 @@ struct FmhaBwdDQDKDVKernel
         #define _TS_  std::to_string
         auto pn = [&] () {
             std::string n;
-            if (kPadSeqLenQ) n += "s";
-            if (kPadSeqLenK) n += "sk";
             if (kPadHeadDimQ) n += "d";
             if (kPadHeadDimV) n += "dv";
             return n.empty() ? n : std::string("p") + n; }();
@@ -100,7 +96,7 @@ struct FmhaBwdDQDKDVKernel
             "r" + _TS_(gbr4::at(ck_tile::number<0>{})) + "x" + _TS_(gbr4::at(ck_tile::number<1>{})) + "x" + _TS_(gbr4::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt0::at(ck_tile::number<0>{})) + "x" + _TS_(gwt0::at(ck_tile::number<1>{})) + "x" + _TS_(gwt0::at(ck_tile::number<2>{})) + "_" +
             "w" + _TS_(gwt1::at(ck_tile::number<0>{})) + "x" + _TS_(gwt1::at(ck_tile::number<1>{})) + "x" + _TS_(gwt1::at(ck_tile::number<2>{})) + "_" +
-            ("o" + _TS_(kBlockPerCu) + "_") + _SS_(FmhaPipeline::name) + (pn.empty() ? "_npad" : "_" + pn) +
+            ("o" + _TS_(kBlockPerCu)) + (pn.empty() ? "_npad" : "_" + pn) +
             (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
             (kHasBiasGrad ? "_dbias" : "_ndbias") + (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kHasDropout ? "_dropout" : "_ndropout" ) +
             (kIsStoreRandval ? "_storerandval" : "" ) + (kIsDeterministic ? "_deterministic" : "_ndeterministic" );
@@ -1221,7 +1217,7 @@ struct FmhaBwdDQDKDVKernel
         const auto q_dram = pad_tensor_view(
             q_dram_naive,
             make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-            sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+            sequence<false, kPadHeadDimQ>{});
 
         const auto k_dram_naive = make_naive_tensor_view<address_space_enum::global>(
             k_ptr,
@@ -1232,7 +1228,7 @@ struct FmhaBwdDQDKDVKernel
         const auto k_dram = pad_tensor_view(
             k_dram_naive,
             make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-            sequence<kPadSeqLenK, kPadHeadDimQ>{});
+            sequence<false, kPadHeadDimQ>{});
 
         const auto v_dram = [&]() {
             const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
@@ -1244,22 +1240,15 @@ struct FmhaBwdDQDKDVKernel
             return pad_tensor_view(
                 v_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kVHeaddim>{}),
-                sequence<kPadSeqLenK, kPadHeadDimV>{});
+                sequence<false, kPadHeadDimV>{});
         }();
 
-        const auto lse_dram = [&]() {
-            const auto lse_dram_naive = make_naive_tensor_view_packed<address_space_enum::global>(
-                lse_ptr, make_tuple(kargs.seqlen_q), number<1>{});
-            return pad_tensor_view(
-                lse_dram_naive, make_tuple(number<FmhaPipeline::kM0>{}), sequence<kPadSeqLenQ>{});
-        }();
+        // lse and d should be fine to read unpaded data as they are not on the reduction dimension
+        const auto lse_dram = make_naive_tensor_view_packed<address_space_enum::global>(
+            lse_ptr, make_tuple(kargs.seqlen_q), number<FmhaPipeline::kM0>{});
 
-        const auto d_dram = [&]() {
-            const auto d_dram_naive = make_naive_tensor_view_packed<address_space_enum::global>(
-                d_ptr, make_tuple(kargs.seqlen_q), number<1>{});
-            return pad_tensor_view(
-                d_dram_naive, make_tuple(number<FmhaPipeline::kM0>{}), sequence<kPadSeqLenQ>{});
-        }();
+        const auto d_dram = make_naive_tensor_view_packed<address_space_enum::global>(
+            d_ptr, make_tuple(kargs.seqlen_q), number<FmhaPipeline::kM0>{});
 
         const auto do_dram_naive = make_naive_tensor_view<address_space_enum::global>(
             do_ptr,
@@ -1270,7 +1259,7 @@ struct FmhaBwdDQDKDVKernel
         const auto do_dram = pad_tensor_view(
             do_dram_naive,
             make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kVHeaddim>{}),
-            sequence<kPadSeqLenQ, kPadHeadDimV>{});
+            sequence<false, kPadHeadDimV>{});
 
         auto q_dram_window = make_tile_window(
             q_dram,
@@ -1313,7 +1302,7 @@ struct FmhaBwdDQDKDVKernel
                     return pad_tensor_view(
                         dq_acc_dram_naive,
                         make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-                        sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                        sequence<false, kPadHeadDimQ>{});
                 }();
 
                 return make_tile_window(
@@ -1341,7 +1330,7 @@ struct FmhaBwdDQDKDVKernel
                     return pad_tensor_view(
                         dq_acc_dram_naive,
                         make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-                        sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                        sequence<false, kPadHeadDimQ>{});
                 }();
 
                 return make_tile_window(
@@ -1376,9 +1365,8 @@ struct FmhaBwdDQDKDVKernel
                         number<FmhaPipeline::kAlignmentBias>{},
                         number<1>{});
 
-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    return pad_tensor_view(
+                        bias_dram_naive, bias_dram_window_lengths, sequence<false, true>{});
                 }();
 
                 return make_tile_window(bias_dram, bias_dram_window_lengths, {0, i_n0});
@@ -1406,9 +1394,8 @@ struct FmhaBwdDQDKDVKernel
                             number<FmhaPipeline::kAlignmentBias>{},
                             number<1>{});
 
-                    return pad_tensor_view(dbias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    return pad_tensor_view(
+                        dbias_dram_naive, bias_dram_window_lengths, sequence<false, true>{});
                 }();
 
                 return make_tile_window(dbias_dram, bias_dram_window_lengths, {0, i_n0});
@@ -1495,9 +1482,8 @@ struct FmhaBwdDQDKDVKernel
                             number<1>{},
                             number<1>{});
 
-                    return pad_tensor_view(randval_dram_naive,
-                                           randval_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    return pad_tensor_view(
+                        randval_dram_naive, randval_dram_window_lengths, sequence<false, true>{});
                 }();
 
                 return make_tile_window(randval_dram, randval_dram_window_lengths, {0, i_n0});
@@ -1550,7 +1536,7 @@ struct FmhaBwdDQDKDVKernel
             return pad_tensor_view(
                 dk_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kQKHeaddim>{}),
-                sequence<kPadSeqLenK, kPadHeadDimQ>{});
+                sequence<false, kPadHeadDimQ>{});
         }();
 
         auto dv_dram = [&]() {
@@ -1564,7 +1550,7 @@ struct FmhaBwdDQDKDVKernel
             return pad_tensor_view(
                 dv_dram_naive,
                 make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kVHeaddim>{}),
-                sequence<kPadSeqLenK, kPadHeadDimV>{});
+                sequence<false, kPadHeadDimV>{});
         }();
 
         auto dk_dram_window = make_tile_window(
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
index 8a13c0b060..1f11569533 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
@@ -49,8 +49,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
     static constexpr index_t kVHeaddim  = BlockFmhaShape::kVHeaddim;
 
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
-    static constexpr bool kPadSeqLenQ      = Problem::kPadSeqLenQ;
-    static constexpr bool kPadSeqLenK      = Problem::kPadSeqLenK;
     static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
     static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
     static constexpr auto BiasEnum         = Problem::BiasEnum;
@@ -72,8 +70,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
         kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad<Problem>();
     static constexpr index_t kAlignmentVGrad =
         kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad<Problem>();
-    static constexpr index_t kAlignmentBias =
-        kPadSeqLenK ? 1 : Policy::template GetTransposedAlignmentBias<Problem>();
+    static constexpr index_t kAlignmentBias = 1;
 
     static constexpr const char* name = "kr_ktr_vr";
 
@@ -554,7 +551,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
                 });
             }
 
-            if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
             {
                 bool need_perpixel_check = mask.IsEdgeTile(
                     seqlen_q_step, k_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
index c88b058d32..967fe2362d 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
@@ -49,8 +49,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
     static constexpr index_t kVHeaddim  = BlockFmhaShape::kVHeaddim;
 
     static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
-    static constexpr bool kPadSeqLenQ      = Problem::kPadSeqLenQ;
-    static constexpr bool kPadSeqLenK      = Problem::kPadSeqLenK;
     static constexpr bool kPadHeadDimQ     = Problem::kPadHeadDimQ;
     static constexpr bool kPadHeadDimV     = Problem::kPadHeadDimV;
     static constexpr auto BiasEnum         = Problem::BiasEnum;
@@ -72,8 +70,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
         kPadHeadDimQ ? 1 : Policy::template GetAlignmentKGrad<Problem>();
     static constexpr index_t kAlignmentVGrad =
         kPadHeadDimV ? 1 : Policy::template GetAlignmentVGrad<Problem>();
-    static constexpr index_t kAlignmentBias =
-        kPadSeqLenK ? 1 : Policy::template GetTransposedAlignmentBias<Problem>();
+    static constexpr index_t kAlignmentBias = 1;
 
     static constexpr const char* name = "kr_ktr_vr_iglp";
 
@@ -590,7 +587,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
                 });
             }
 
-            if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
             {
                 bool need_perpixel_check = mask.IsEdgeTile(
                     seqlen_q_step, k_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
@@ -849,7 +845,6 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
             });
         }
 
-        if constexpr(kPadSeqLenK || FmhaMask::IsMasking)
         {
             bool need_perpixel_check = mask.IsEdgeTile(
                 seqlen_q_step, k_origin.at(number<0>{}), number<kM0>{}, number<kN0>{});
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp
new file mode 100644
index 0000000000..80c311de86
--- /dev/null
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_selector.hpp
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp"
+#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp"
+
+namespace ck_tile {
+
+template <typename Problem>
+class BlockFmhaBwdDQDKDVPipelineSelector
+{
+    static constexpr bool has_dpad = Problem::Traits::kPadHeadDimQ || Problem::Traits::kPadHeadDimV;
+
+    public:
+    using type = std::conditional_t<has_dpad,
+                                    BlockFmhaBwdDQDKDVPipelineKRKTRVR<Problem>,
+                                    BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP<Problem>>;
+};
+
+template <typename Problem>
+class BlockFmhaBwdDQDKDVPipeline : public BlockFmhaBwdDQDKDVPipelineSelector<Problem>::type
+{
+    public:
+    static constexpr const char* name = "auto";
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp
deleted file mode 100644
index 27f58ef2f8..0000000000
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_enum.hpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-namespace ck_tile {
-
-// This class is used for codegen pattern matching
-enum class BlockFmhaBwdPipelineEnum
-{
-    KRKTRVR_IGLP = 0,
-    KRKTRVR,
-};
-
-} // namespace ck_tile
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp
index c4c4a745a7..f6c79c7db6 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_problem.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -55,13 +55,13 @@ struct BlockFmhaBwdPipelineProblem
     static constexpr bool kIsDeterministic = kIsDeterministic_;
 
     // attributes from traits
-    static constexpr bool kPadSeqLenQ    = Traits::kPadSeqLenQ;
-    static constexpr bool kPadSeqLenK    = Traits::kPadSeqLenK;
     static constexpr bool kPadHeadDimQ   = Traits::kPadHeadDimQ;
     static constexpr bool kPadHeadDimV   = Traits::kPadHeadDimV;
     static constexpr auto BiasEnum       = Traits::BiasEnum;
     static constexpr bool kHasBiasGrad   = Traits::kHasBiasGrad;
     static constexpr index_t kBlockPerCu = Traits::kBlockPerCu;
+    static_assert(!Traits::kPadSeqLenQ, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ");
+    static_assert(!Traits::kPadSeqLenK, "BlockFmhaBwdPipelineProblem does not need kPadSeqLenQ");
 };
 
 template <typename ODataType_,
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index 921ea11720..477a87d42f 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -252,13 +252,6 @@ struct GroupedGemmKernel
         return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
     }
 
-    CK_TILE_DEVICE void Run(const GemmTransKernelArg& kargs,
-                            const tuple<index_t, index_t>& block_idx_2d,
-                            const index_t block_idx_z) const
-    {
-        Run(kargs.group_karg, block_idx_2d, block_idx_z);
-    }
-
     CK_TILE_DEVICE void Run(const UniversalGemmKernelArgs<>& kargs,
                             const tuple<index_t, index_t>& block_idx_2d,
                             const index_t block_idx_z) const
@@ -277,24 +270,56 @@ struct GroupedGemmKernel
         CDataType* c_ptr = static_cast<CDataType*>(kargs.e_ptr);
 
         // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
+        __shared__ char smem_ptr_0[GetSmemSize()];
 
-        if constexpr(UsePersistentKernel)
+        if constexpr(GemmPipeline::DoubleSmemBuffer == true)
         {
-            RunGemmWithPipelineSelection(
-                a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+            __shared__ char smem_ptr_1[GetSmemSize()];
+            if constexpr(UsePersistentKernel)
+            {
+                RunGemmWithPipelineSelection2LDS(a_ptr,
+                                                 b_ptr,
+                                                 c_ptr,
+                                                 smem_ptr_0,
+                                                 smem_ptr_1,
+                                                 kargs,
+                                                 splitk_batch_offset,
+                                                 i_m,
+                                                 i_n);
+            }
+            else
+            {
+                Base::RunGemm2LDS({a_ptr},
+                                  {b_ptr},
+                                  {/*ds_ptr*/},
+                                  c_ptr,
+                                  smem_ptr_0,
+                                  smem_ptr_1,
+                                  kargs,
+                                  splitk_batch_offset,
+                                  i_m,
+                                  i_n);
+            }
         }
         else
         {
-            Base::RunGemm({a_ptr},
-                          {b_ptr},
-                          {/*ds_ptr*/},
-                          c_ptr,
-                          smem_ptr,
-                          kargs,
-                          splitk_batch_offset,
-                          i_m,
-                          i_n);
+            if constexpr(UsePersistentKernel)
+            {
+                RunGemmWithPipelineSelection(
+                    a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+            }
+            else
+            {
+                Base::RunGemm({a_ptr},
+                              {b_ptr},
+                              {/*ds_ptr*/},
+                              c_ptr,
+                              smem_ptr_0,
+                              kargs,
+                              splitk_batch_offset,
+                              i_m,
+                              i_n);
+            }
         }
     }
 
@@ -358,6 +383,69 @@ struct GroupedGemmKernel
             c_block_window, c_block_tile, d_block_window, smem_ptr_0);
     }
 
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @note The GEMM pipeline is selected in-kernel based on the number of K-loops
+     *       and the tail-number. This is needed for the persistent tile-loop when
+     *       we didn't have access to the K dimension on the host.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
+     * @param smem_ptr_1 The second start memory pointer of the shared memory block.
+     * @param kargs GEMM kernel arguments
+     * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k
+     * batch.
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     */
+    CK_TILE_DEVICE static void
+    RunGemmWithPipelineSelection2LDS(const ADataType* a_ptr,
+                                     const BDataType* b_ptr,
+                                     CDataType* c_ptr,
+                                     void* __restrict__ smem_ptr_0,
+                                     void* __restrict__ smem_ptr_1,
+                                     const UniversalGemmKernelArgs<>& kargs,
+                                     const typename Base::SplitKBatchOffset& splitk_batch_offset,
+                                     const index_t block_idx_m,
+                                     const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            Base::template MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
+                {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset);
+
+        const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows =
+            Base::MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+        const auto& a_block_window = gemm_tile_windows.at(Base::I0);
+        const auto& b_block_window = gemm_tile_windows.at(Base::I1);
+        const auto& d_block_window = gemm_tile_windows.at(Base::I2);
+
+        // Get hot-loop and tail configuration
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const bool has_hot_loop   = GemmPipeline::BlockHasHotloop(num_loop);
+        const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
+
+        // Run GEMM pipeline
+        const auto& c_block_tile = GemmPipeline{}.template operator()(a_block_window[Base::I0],
+                                                                      b_block_window[Base::I0],
+                                                                      num_loop,
+                                                                      has_hot_loop,
+                                                                      tail_num,
+                                                                      smem_ptr_0,
+                                                                      smem_ptr_1);
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(Base::I3);
+        EpiloguePipeline{}.template
+        operator()<decltype(c_block_window), decltype(c_block_tile), decltype(d_block_window)>(
+            c_block_window, c_block_tile, d_block_window, smem_ptr_0);
+    }
+
     CK_TILE_DEVICE index_t FindGroupId(const GemmTransKernelArg* gemm_desc_ptr,
                                        index_t block_id,
                                        index_t group_count) const
@@ -401,7 +489,7 @@ struct GroupedGemmKernel
             kargs.group_karg.M,
             kargs.group_karg.N,
             (block_id - kargs.block_start) % grid_size_2d);
-        Run(kargs, block_idx_2d, (block_id - kargs.block_start) / grid_size_2d);
+        Run(kargs.group_karg, block_idx_2d, (block_id - kargs.block_start) / grid_size_2d);
     }
 
     // For persistent kernels
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index 50a2110935..ce800d03ab 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -18,12 +18,14 @@ struct BaseGemmPipelineAgBgCrCompV4
     static constexpr index_t PrefillStages   = 1;
     static constexpr index_t GlobalBufferNum = 1;
 
-    CK_TILE_HOST static constexpr bool BlockHasHotloop(index_t num_loop)
+    static constexpr bool UsePersistentKernel = Problem::Traits::UsePersistentKernel;
+
+    CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t num_loop)
     {
         return num_loop > PrefetchStages;
     }
 
-    CK_TILE_HOST static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
+    CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
     {
         if(num_loop % PrefetchStages == 1)
         {
diff --git a/include/ck_tile/ops/reduce.hpp b/include/ck_tile/ops/reduce.hpp
index 80ead84e85..a6721c9305 100644
--- a/include/ck_tile/ops/reduce.hpp
+++ b/include/ck_tile/ops/reduce.hpp
@@ -5,8 +5,11 @@
 
 #include "ck_tile/ops/reduce/block/block_reduce.hpp"
 #include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
-#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
 #include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
+#include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp"
+#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp"
+#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp"
+#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
 #include "ck_tile/ops/common/utils.hpp"
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
index 62c9944bd2..849fa6c252 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
@@ -7,20 +7,55 @@
 
 namespace ck_tile {
 
+// BlockReduce2d implements a hierarchical 2D reduction operator that reduces data along the second
+// dimension using a user-specified reduction function.
+//
+// The reduction is performed in a three-stage hierarchical approach:
+//
+// STAGE 1: Thread-level reduction (BlockReduce2d)
+// ===============================================
+// - Each thread processes multiple elements from the input tensor within its assigned data
+// partition
+// - Reduction is performed locally within each thread by iterating over assigned elements
+// - ReducePacksPerXDim controls how many elements sweep_tile processes in one iteration per
+// dimension
+//   (e.g., {1,1} = 1 element at a time from each dimension, {2,4} = 2 from dim0, 4 from dim1)
+// - Results are accumulated into a thread-local output tensor stored in registers
+// - The output tensor distribution is derived from the input tensor's distribution using
+//   make_reduce_tile_distribution_encoding() to handle dimension reduction
+//
+// STAGE 2: Warp-level reduction (BlockReduce2dSync)
+// ================================================
+// - Performs inter-thread reduction within each warp
+// - Uses warp shuffle operations to exchange data between threads in the same warp
+// - Implements a tree-reduction pattern with power-of-2 stages
+// - Only reduces along dimensions that map to lane IDs within the warp
+//
+// STAGE 3: Cross-warp reduction (BlockReduce2dCrossWarpSync)
+// ========================================================
+// - Performs reduction across multiple warps within the same thread block
+// - Uses shared memory (LDS) to facilitate data exchange between warps
+// - Each warp's lane-0 thread stores its partial results to shared memory
+// - All threads participate in loading and reducing data from shared memory
+// - Implements block-level synchronization to ensure memory consistency
+
+// BlockReduce2d: Thread-level reduction (Stage 1)
 template <typename Problem_, typename Policy_ = void>
 struct BlockReduce2d
 {
-    // in-thread reduction
+    // Thread-level reduction implementation
     using Problem         = remove_cvref_t<Problem_>;
     using XDataType       = typename Problem::XDataType;
     using ComputeDataType = typename Problem::ComputeDataType;
 
     CK_TILE_DEVICE constexpr BlockReduce2d() {}
 
-    template <typename XDistributedTensor_,
-              typename YDistributedTensor_,
-              typename ReduceFunc,
-              typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
+    template <
+        typename XDistributedTensor_,
+        typename YDistributedTensor_,
+        typename ReduceFunc,
+        typename ReducePacksPerXDim =
+            uniform_sequence_gen_t<2, 1>> // {1,1} = process 1 element at a time from each dimension
     CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
                                    YDistributedTensor_& y_tensor,
                                    const ReduceFunc& reduce_func,
@@ -33,6 +68,7 @@ struct BlockReduce2d
                     y_tensor(idx_0), ck_tile::type_convert<ComputeDataType>(x_tensor[idx_])...);
             },
             ReducePacksPerXDim{});
+
 #if 0
         constexpr auto I0 = number<0>{};
         constexpr auto I1 = number<1>{};
@@ -75,6 +111,8 @@ struct BlockReduce2d
         return tensor;
     }
 
+    // uniform_sequence_gen_t<NSize, Value> generates sequence of NSize elements filled with Value
+    // e.g., uniform_sequence_gen_t<2, 1> → {1, 1} and uniform_sequence_gen_t<3, 4> → {4, 4, 4}
     template <typename XDistributedTensor_,
               typename ReduceFunc,
               typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
@@ -91,6 +129,7 @@ struct BlockReduce2d
     }
 };
 
+// BlockReduce2dSync: Warp-level reduction (Stage 2)
 template <typename Problem_, typename Policy_ = void>
 struct BlockReduce2dSync
 {
@@ -145,8 +184,15 @@ struct BlockReduce2dSync
                         // pull data from remote lane
                         const auto v_remote = warp_shuffle(v_local, src_lane);
 
-                        // reduce
-                        v_local = reduce_func(v_local, v_remote);
+                        // For reduce, use combine_partial_results for operations that require it
+                        if constexpr(ReduceFunc::requires_special_combine)
+                        {
+                            v_local = reduce_func.combine_partial_results(v_local, v_remote);
+                        }
+                        else
+                        {
+                            v_local = reduce_func(v_local, v_remote);
+                        }
                     });
                 }
             });
@@ -157,6 +203,7 @@ struct BlockReduce2dSync
     }
 };
 
+// BlockReduce2dCrossWarpSync: Cross-warp reduction (Stage 3)
 template <typename Problem_, typename Policy_ = void>
 struct BlockReduce2dCrossWarpSync
 {
@@ -263,8 +310,15 @@ struct BlockReduce2dCrossWarpSync
                 constexpr auto i_1      = number<i_1_n1 + 1>{};
                 const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1];
 
-                // reduce
-                v_local = reduce_func(v_local, v_remote);
+                // For reduce, use combine_partial_results for operations that require it
+                if constexpr(ReduceFunc::requires_special_combine)
+                {
+                    v_local = reduce_func.combine_partial_results(v_local, v_remote);
+                }
+                else
+                {
+                    v_local = reduce_func(v_local, v_remote);
+                }
             });
 
             y_tensor.get_thread_buffer()(i_0) = v_local;
diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
new file mode 100644
index 0000000000..f65487ea6e
--- /dev/null
+++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce.hpp"
+#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp"
+
+// Reduce2d Kernel:
+// =======================================
+// This kernel implements a 2D reduction operation that reduces data along the second dimension
+// of a matrix. The reduction is performed in multiple hierarchical stages.
+
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_ = Reduce2dDefaultPolicy>
+struct Reduce
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+
+    private:
+    // Helper function to calculate optimal vector size for input tensor
+    template <typename InputShape, typename ReduceDims>
+    static constexpr index_t CalculateInputVectorSize()
+    {
+        using S                                   = typename Problem::BlockShape;
+        constexpr index_t memory_vector_size      = 16 / sizeof(XDataType);
+        constexpr index_t thread_tile_vector_size = S::ThreadTile_N;
+
+        // Check if innermost reduce dimension is the last dimension (stride 1).
+        constexpr auto innermost_reduce_dim    = ReduceDims{}.at(number<ReduceDims{}.size() - 1>{});
+        constexpr bool is_innermost_contiguous = (innermost_reduce_dim == InputShape{}.size() - 1);
+
+        // If innermost reduce dimension is not the last dim (not contiguous), limit vectorization
+        constexpr index_t stride_based_vector_size =
+            is_innermost_contiguous ? ck_tile::min(memory_vector_size, thread_tile_vector_size) : 1;
+
+        return stride_based_vector_size;
+    }
+
+    // Helper function to calculate optimal vector size for output tensor
+    static constexpr index_t CalculateOutputVectorSize()
+    {
+        using S                                   = typename Problem::BlockShape;
+        constexpr index_t memory_vector_size      = 16 / sizeof(YDataType);
+        constexpr index_t thread_tile_vector_size = S::ThreadTile_M;
+        constexpr index_t vector_size = ck_tile::min(memory_vector_size, thread_tile_vector_size);
+
+        return vector_size;
+    }
+
+    public:
+    template <typename InputShape, typename InputStrides, typename KeptDim, typename ReduceDims>
+    CK_TILE_DEVICE void operator()(const XDataType* p_x,
+                                   YDataType* p_y,
+                                   InputShape input_shape,
+                                   InputStrides input_strides,
+                                   KeptDim kept_dim,
+                                   ReduceDims reduce_dims) const
+    {
+        using S       = typename Problem::BlockShape;
+        const auto iM = get_block_id() * S::Block_M;
+
+        static_assert(kept_dim.size() + reduce_dims.size() == InputShape::size(),
+                      "Size of kept dimensions + reduced dimensions must equal input tensor rank");
+
+        // Extract lengths based on kept and reduced dimensions
+        const auto kept_lens = [&]() {
+            return generate_tuple([&](auto I) { return input_shape.at(number<kept_dim.at(I)>{}); },
+                                  number<kept_dim.size()>{});
+        }();
+        const auto reduce_lens = [&]() {
+            return generate_tuple(
+                [&](auto I) { return input_shape.at(number<reduce_dims.at(I)>{}); },
+                number<reduce_dims.size()>{});
+        }();
+
+        const auto kept_merge_transform   = make_merge_transform(kept_lens);
+        const auto reduce_merge_transform = make_merge_transform(reduce_lens);
+
+        auto reduce_func = typename Problem::ReduceOp{};
+        const XDataType custom_padding_value =
+            type_convert<XDataType>(reduce_func.template GetIdentityValue<ComputeDataType>());
+
+        // Calculate optimal vector size for input tensor
+        constexpr auto x_tensor_vector_size = CalculateInputVectorSize<InputShape, ReduceDims>();
+
+        // Create input tensor view with custom padding value
+        auto desc = make_naive_tensor_descriptor(
+            input_shape, input_strides, number<x_tensor_vector_size>{}, number<1>{});
+
+        // Create buffer view with custom padding value
+        auto buffer_view = make_buffer_view<address_space_enum::global>(
+            p_x, desc.get_element_space_size(), custom_padding_value);
+
+        // Create tensor view with custom padding
+        const auto x_tensor = tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
+        const auto transformed_x_tensor = pad_tensor_view(
+            transform_tensor_view(x_tensor,
+                                  make_tuple(kept_merge_transform, reduce_merge_transform),
+                                  make_tuple(kept_dim, reduce_dims),
+                                  make_tuple(sequence<0>{}, sequence<1>{})),
+            make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+            sequence<0, 1>{});
+
+        // Calculate strides for output tensor based on its own dimensions
+        const auto kept_strides = [&]() {
+            return generate_tuple(
+                [&](auto I) {
+                    // Calculate stride for dimension I as product of all following dimensions
+                    index_t stride = 1;
+                    static_for<I + 1, kept_dim.size(), 1>{}(
+                        [&](auto J) { stride *= kept_lens.at(number<J>{}); });
+                    return stride;
+                },
+                number<kept_dim.size()>{});
+        }();
+
+        // Calculate optimal vector size for output tensor
+        constexpr auto y_tensor_vector_size = CalculateOutputVectorSize();
+
+        const auto y_m = make_naive_tensor_view<address_space_enum::global>(
+            p_y, kept_lens, kept_strides, number<y_tensor_vector_size>{}, number<1>{});
+
+        // Transform output tensor to 1D merged view
+        // This creates a view compatible with the 2D reduction pattern
+        const auto y_merged = transform_tensor_view(
+            y_m,
+            make_tuple(kept_merge_transform),
+            make_tuple(typename arithmetic_sequence_gen<0, kept_dim.size(), 1>::type{}),
+            make_tuple(sequence<0>{}));
+
+        auto x_window = make_tile_window(transformed_x_tensor,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>());
+
+        auto y_window = make_tile_window(y_merged, make_tuple(number<S::Block_M>{}), {iM});
+
+        __shared__ char smem[Policy::template GetSmemSize<Problem>()];
+
+        // Get the merged dimension size from the transformed tensor
+        const auto merged_reduce_len =
+            transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N));
+
+        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        using XTensorType = decltype(load_tile(x_window));
+        auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>();
+        set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_reduce2d(x, y_compute, reduce_func);
+            move_tile_window(x_window, {0, S::Block_N});
+        }
+
+        block_reduce2d_sync(y_compute, reduce_func);
+        block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func);
+
+        store_tile(y_window, cast_tile<YDataType>(y_compute));
+    }
+
+    /// @brief Validates if the given arguments are supported by the 2D reduction kernel.
+    ///
+    /// @param y_continous_dim Size of the continuous dimension of the output tensor.
+    ///                        Must be a multiple of ThreadTile_N for proper thread mapping.
+    ///
+    /// @param input_strides   The stride configuration of the input tensor.
+    ///                        The last stride must be 1 to ensure contiguous memory access
+    ///                        and enable efficient vectorized loads.
+    ///
+    /// @return true if the arguments are supported, false otherwise.
+    ///         Error messages are logged when CK_TILE_LOGGING is enabled.
+    ///
+    /// @note Requirements:
+    ///       - y_continous_dim % ThreadTile_N == 0 (for proper thread distribution)
+    ///       - input_strides[-1] == 1 (for contiguous memory access)
+    CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim, auto input_strides)
+    {
+        using S = typename Problem::BlockShape;
+
+        if(y_continous_dim % S::ThreadTile_N != 0)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR("Total reduction size should be a multiple of ThreadTile_N!");
+            }
+            return false;
+        }
+
+        if(input_strides.at(number<input_strides.size() - 1>{}) != 1)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR(
+                    "Input tensor's last stride must be 1 to support correct vector access!");
+            }
+            return false;
+        }
+
+        return true;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp
similarity index 89%
rename from include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp
rename to include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp
index 3c547242d5..27bb4bcdcb 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp
+++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -9,7 +9,7 @@
 
 namespace ck_tile {
 
-struct BlockReduce2dDefaultPolicy
+struct Reduce2dDefaultPolicy
 {
     template <typename Problem>
     CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
@@ -18,8 +18,9 @@ struct BlockReduce2dDefaultPolicy
         return make_static_tile_distribution(
             tile_distribution_encoding<
                 sequence<>,
-                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
-                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<
+                    sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::ThreadTile_M>,
+                    sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::ThreadTile_N>>,
                 tuple<sequence<1, 2>, sequence<1, 2>>,
                 tuple<sequence<1, 1>, sequence<2, 2>>,
                 sequence<1, 1, 2, 2>,
diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp
new file mode 100644
index 0000000000..67fdec9286
--- /dev/null
+++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename XDataType_,
+          typename ComputeDataType_,
+          typename YDataType_,
+          typename BlockShape_,
+          typename ReduceOp_>
+struct Reduce2dProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+    using ReduceOp        = ReduceOp_;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp
new file mode 100644
index 0000000000..31eb1f2f4f
--- /dev/null
+++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename BlockWarps, // num warps along seq<M, N>
+          typename BlockTile,  // block size, seq<M, N>
+          typename WarpTile,   // warp size, seq<M, N>
+          typename ThreadTile> // contiguous pixels(vector size) along seq<M, N>
+struct Reduce2dShape
+{
+    static constexpr index_t Block_M = BlockTile::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile::at(number<1>{});
+
+    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
+
+    static constexpr index_t ThreadTile_M = ThreadTile::at(number<0>{});
+    static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{});
+
+    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
+
+    static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N;
+
+    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
+
+    static constexpr index_t BlockSize =
+        ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+};
+} // namespace ck_tile
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index f6983810be..bf7f1b4fa4 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -113,29 +113,30 @@ using GK_Tuple    = ck::Tuple<G_K>;
 using GK_GK_Tuple = ck::Tuple<G_K, G_K>;
 
 // pointwise functor
-using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
-using Relu                = ck::tensor_operation::element_wise::Relu;
-using TanH                = ck::tensor_operation::element_wise::TanH;
-using Scale               = ck::tensor_operation::element_wise::Scale;
-using Bilinear            = ck::tensor_operation::element_wise::Bilinear;
-using AddAddFastGelu      = ck::tensor_operation::element_wise::AddAddFastGelu;
-using AddFastGelu         = ck::tensor_operation::element_wise::AddFastGelu;
-using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
-using AddRelu             = ck::tensor_operation::element_wise::AddRelu;
-using AddClamp            = ck::tensor_operation::element_wise::AddClamp;
-using Clamp               = ck::tensor_operation::element_wise::Clamp;
-using AddSilu             = ck::tensor_operation::element_wise::AddSilu;
-using AddReluAdd          = ck::tensor_operation::element_wise::AddReluAdd;
-using FastGelu            = ck::tensor_operation::element_wise::FastGelu;
-using MultiplyFastGelu    = ck::tensor_operation::element_wise::MultiplyFastGelu;
-using AddMultiply         = ck::tensor_operation::element_wise::AddMultiply;
-using MultiplyAdd         = ck::tensor_operation::element_wise::MultiplyAdd;
-using MultiplyMultiply    = ck::tensor_operation::element_wise::MultiplyMultiply;
-using ScaleAdd            = ck::tensor_operation::element_wise::ScaleAdd;
-using Gelu                = ck::tensor_operation::element_wise::Gelu;
-using Swish               = ck::tensor_operation::element_wise::Swish;
-using Add                 = ck::tensor_operation::element_wise::Add;
-using Multiply            = ck::tensor_operation::element_wise::Multiply;
+using PassThrough               = ck::tensor_operation::element_wise::PassThrough;
+using Relu                      = ck::tensor_operation::element_wise::Relu;
+using TanH                      = ck::tensor_operation::element_wise::TanH;
+using Scale                     = ck::tensor_operation::element_wise::Scale;
+using Bilinear                  = ck::tensor_operation::element_wise::Bilinear;
+using AddAddFastGelu            = ck::tensor_operation::element_wise::AddAddFastGelu;
+using AddFastGelu               = ck::tensor_operation::element_wise::AddFastGelu;
+using MultiplyAddFastGelu       = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
+using AddRelu                   = ck::tensor_operation::element_wise::AddRelu;
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+using AddClamp                  = ck::tensor_operation::element_wise::AddClamp;
+using Clamp                     = ck::tensor_operation::element_wise::Clamp;
+using AddSilu                   = ck::tensor_operation::element_wise::AddSilu;
+using AddReluAdd                = ck::tensor_operation::element_wise::AddReluAdd;
+using FastGelu                  = ck::tensor_operation::element_wise::FastGelu;
+using MultiplyFastGelu          = ck::tensor_operation::element_wise::MultiplyFastGelu;
+using AddMultiply               = ck::tensor_operation::element_wise::AddMultiply;
+using MultiplyAdd               = ck::tensor_operation::element_wise::MultiplyAdd;
+using MultiplyMultiply          = ck::tensor_operation::element_wise::MultiplyMultiply;
+using ScaleAdd                  = ck::tensor_operation::element_wise::ScaleAdd;
+using Gelu                      = ck::tensor_operation::element_wise::Gelu;
+using Swish                     = ck::tensor_operation::element_wise::Swish;
+using Add                       = ck::tensor_operation::element_wise::Add;
+using Multiply                  = ck::tensor_operation::element_wise::Multiply;
 
 template <typename Activation>
 using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Activation>;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
index fca236d03e..bbc2a54c34 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
@@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>;
 
 using namespace ck::tensor_layout::convolution;
 
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using AddClamp    = ck::tensor_operation::element_wise::AddClamp;
-using Clamp       = ck::tensor_operation::element_wise::Clamp;
+using PassThrough               = ck::tensor_operation::element_wise::PassThrough;
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+using AddClamp                  = ck::tensor_operation::element_wise::AddClamp;
+using Clamp                     = ck::tensor_operation::element_wise::Clamp;
 
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
index c641019b70..768fcbada0 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
@@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>;
 
 using namespace ck::tensor_layout::convolution;
 
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using AddClamp    = ck::tensor_operation::element_wise::AddClamp;
-using Clamp       = ck::tensor_operation::element_wise::Clamp;
+using PassThrough               = ck::tensor_operation::element_wise::PassThrough;
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+using AddClamp                  = ck::tensor_operation::element_wise::AddClamp;
+using Clamp                     = ck::tensor_operation::element_wise::Clamp;
 
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
index 3e98852d58..5a4a011512 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp
@@ -24,9 +24,10 @@ using Empty_Tuple = ck::Tuple<>;
 
 using namespace ck::tensor_layout::convolution;
 
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using AddClamp    = ck::tensor_operation::element_wise::AddClamp;
-using Clamp       = ck::tensor_operation::element_wise::Clamp;
+using PassThrough               = ck::tensor_operation::element_wise::PassThrough;
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+using AddClamp                  = ck::tensor_operation::element_wise::AddClamp;
+using Clamp                     = ck::tensor_operation::element_wise::Clamp;
 
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
index 4e6b9c3d1d..57bdeddcf9 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp
@@ -32,9 +32,10 @@ using Empty_Tuple = ck::Tuple<>;
 
 using namespace ck::tensor_layout::convolution;
 
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using AddClamp    = ck::tensor_operation::element_wise::AddClamp;
-using Clamp       = ck::tensor_operation::element_wise::Clamp;
+using PassThrough               = ck::tensor_operation::element_wise::PassThrough;
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+using AddClamp                  = ck::tensor_operation::element_wise::AddClamp;
+using Clamp                     = ck::tensor_operation::element_wise::Clamp;
 
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
index 7ef78d46e2..d07d82e7ee 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
@@ -24,9 +24,10 @@ using Empty_Tuple = ck::Tuple<>;
 
 using namespace ck::tensor_layout::convolution;
 
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using AddClamp    = ck::tensor_operation::element_wise::AddClamp;
-using Clamp       = ck::tensor_operation::element_wise::Clamp;
+using PassThrough               = ck::tensor_operation::element_wise::PassThrough;
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+using AddClamp                  = ck::tensor_operation::element_wise::AddClamp;
+using Clamp                     = ck::tensor_operation::element_wise::Clamp;
 
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp
new file mode 100644
index 0000000000..22cb7854a9
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+#ifdef CK_USE_XDL
+#include "grouped_convolution_forward_bias_bnorm_clamp_xdl.inc"
+#endif
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DLayouts,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename DDataTypes,
+          typename AComputeType,
+          typename BComputeType>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+    NumDimSpatial,
+    InLayout,
+    WeiLayout,
+    DLayouts,
+    OutLayout,
+    InDataType,
+    WeiDataType,
+    DDataTypes,
+    OutDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::BiasNormalizeInInferClamp,
+    AComputeType,
+    BComputeType>>
+{
+    using DeviceOp = DeviceGroupedConvFwdMultipleABD<
+        NumDimSpatial,
+        InLayout,
+        WeiLayout,
+        DLayouts,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        DDataTypes,
+        OutDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::BiasNormalizeInInferClamp,
+        AComputeType,
+        BComputeType>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_USE_XDL
+        // layout NHWGC/GKYXC/NHWGK
+        if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, NHWGC> &&
+                     is_same_v<WeiLayout, GKYXC> && is_same_v<OutLayout, NHWGK>)
+        {
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> &&
+                         is_same_v<AComputeType, ck::bhalf_t> &&
+                         is_same_v<BComputeType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<AComputeType, half_t> &&
+                         is_same_v<BComputeType, half_t>)
+            {
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
+                         is_same_v<BComputeType, float>)
+            {
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+        // layout NDHWGC/GKZYXC/NDHWGK
+        if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
+                     is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
+        {
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> &&
+                         is_same_v<AComputeType, ck::bhalf_t> &&
+                         is_same_v<BComputeType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP16
+            if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                         is_same_v<OutDataType, half_t> && is_same_v<AComputeType, half_t> &&
+                         is_same_v<BComputeType, half_t>)
+            {
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+#ifdef CK_ENABLE_FP32
+            if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                         is_same_v<OutDataType, float> && is_same_v<AComputeType, float> &&
+                         is_same_v<BComputeType, float>)
+            {
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
+        }
+#endif // CK_USE_XDL
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc
new file mode 100644
index 0000000000..b11b428471
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp_xdl.inc
@@ -0,0 +1,776 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#ifdef CK_ENABLE_BF16
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        BF16,
+                                                        BF16,
+                                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        BF16,
+                                        BF16,
+                                        Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                        BF16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+#endif
+
+#ifdef CK_ENABLE_FP16
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F16,
+                                                        F16,
+                                                        Tuple<F16, F16, F16, F16, F16>,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F16,
+                                        F16,
+                                        Tuple<F16, F16, F16, F16, F16>,
+                                        F16,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+#endif
+
+#ifdef CK_ENABLE_FP32
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances(
+    std::vector<
+        std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                        NHWGC,
+                                                        GKYXC,
+                                                        Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        F32,
+                                                        F32,
+                                                        Tuple<F32, F32, F32, F32, F32>,
+                                                        F32,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances(
+    std::vector<std::unique_ptr<
+        DeviceGroupedConvFwdMultipleABD<3,
+                                        NDHWGC,
+                                        GKZYXC,
+                                        Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                        NDHWGK,
+                                        F32,
+                                        F32,
+                                        Tuple<F32, F32, F32, F32, F32>,
+                                        F32,
+                                        PassThrough,
+                                        PassThrough,
+                                        BiasNormalizeInInferClamp>>>& instances);
+
+#endif
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 5204b51edf..1eaaa7e6ba 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -175,6 +175,10 @@ function(add_instance_library INSTANCE_NAME)
 
         target_compile_features(${INSTANCE_NAME} PUBLIC)
 
+        # splits debug information into separate .dwo files to reduce debug section size
+        if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+            target_compile_options(${INSTANCE_NAME} PRIVATE -gsplit-dwarf)
+        endif()
         # flags to compress the library
         if(NOT DISABLE_OFFLOAD_COMPRESS AND NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
             message(DEBUG "Adding --offload-compress flag for ${INSTANCE_NAME}")
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
index e3e90c966d..3c332c3b22 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -90,10 +90,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are deprecated.  They may be removed in a future release."
     add_device_operation_instances(instances,
                                    device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances{});
     add_device_operation_instances(
         instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_bf16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
index 81e9122d95..aaaeda0312 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -83,10 +83,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(
         DeviceConvBwdData<1, NWC, KXC, NWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances{});
     add_device_operation_instances(
         instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
index dbc82168f4..331cc3c4b2 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -82,10 +82,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(
         DeviceConvBwdData<1, NWC, KXC, NWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
         instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances{});
     add_device_operation_instances(
         instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f32_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
index 3ac250f3e6..4e51074b3a 100644
--- a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -87,10 +87,16 @@ void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances{});
     add_device_operation_instances(
         instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_int8_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
index 6ca909c35e..58b3f8e37d 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances{});
     add_device_operation_instances(
         instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
index d263e98851..a487f0a6f0 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances{});
     add_device_operation_instances(
         instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
index bc949e757c..cfd4f849b8 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -71,10 +71,16 @@ void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances{});
     add_device_operation_instances(
         instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
index 366d1fe160..c2f55d94eb 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -140,6 +140,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances{});
     add_device_operation_instances(
@@ -149,6 +151,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
     add_device_operation_instances(
         instances,
         device_conv_dedidecate_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
index 422e37e926..5df1c9cf39 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -142,6 +142,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{});
     add_device_operation_instances(
@@ -150,6 +152,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
         instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{});
     add_device_operation_instances(
         instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
index 5993f6bd7a..76ca976e37 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -139,6 +139,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{});
     add_device_operation_instances(
@@ -147,6 +149,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
         instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{});
     add_device_operation_instances(
         instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
index 2f079c234c..8221515caa 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -136,6 +136,8 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{});
     add_device_operation_instances(
@@ -144,6 +146,10 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
         instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{});
     add_device_operation_instances(
         instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
index 86c17aacf0..d7a82fdd2c 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -180,6 +180,8 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
         DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances{});
     add_device_operation_instances(
@@ -200,6 +202,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
         add_device_operation_instances(
             instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{});
     }
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
index 63c612523f..153b770e1b 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -114,12 +114,18 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(
                                               PassThrough,
                                               PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_bf16_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
index 0f3b9e7939..fd0c94250f 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -107,11 +107,17 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
         DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f16_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
index 14f9b5cd6a..038316ac31 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -106,11 +106,17 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
         DeviceConvFwd<2, NHWC, KYXC, NHWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
         instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f32_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
index 3f641cdadc..c77c8683c8 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -111,12 +111,18 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
                                               PassThrough,
                                               PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_int8_instances{});
     add_device_operation_instances(instances,
                                    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
index 3402653e84..97830449ee 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -179,6 +179,8 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instanc
 void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(
     std::vector<DeviceConvFwdBiasActivationPtr<PassThrough, PassThrough, AddRelu>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(
         instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances{});
     add_device_operation_instances(
@@ -203,6 +205,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(
             instances,
             device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{});
     }
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
index faac2813ba..e5c682d3cd 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -177,6 +177,8 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_ins
 void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances(
     std::vector<DeviceConvFwdBiasActivationAddPtr<PassThrough, PassThrough, AddReluAdd>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(
         instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances{});
     add_device_operation_instances(
@@ -204,6 +206,10 @@ void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instan
             instances,
             device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances_2x{});
     }
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
index 94b2a47e50..0b9a6c2b8d 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -90,10 +90,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances{});
     add_device_operation_instances(
         instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_bf16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
index 4244ab7b87..6c54552cc8 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -90,10 +90,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances{});
     add_device_operation_instances(
         instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f16_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
index 5c7db4ca3b..363e342c1b 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -89,10 +89,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances{});
     add_device_operation_instances(
         instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f32_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
index ebc56487a1..35bca49fed 100644
--- a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <cstdlib>
 
@@ -87,10 +87,16 @@ void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances)
 {
+#if CK_BUILD_DEPRECATED
+#pragma message "These instances are getting deprecated"
     add_device_operation_instances(instances,
                                    device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances{});
     add_device_operation_instances(
         instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_int8_instances{});
+#else
+#pragma message "These instances were deprecated"
+    std::ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt
index c8740e8d8c..0ffe5f95b2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_blockscale_wp/CMakeLists.txt
@@ -10,14 +10,14 @@ list(APPEND GEMM_BLOCKSCALE_WP_INSTANCES
 check_cxx_compiler_flag("-mllvm --misched-bottomup=1" HAS_MISCHED_BOTTOMUP)
 check_cxx_compiler_flag("-mllvm --misched-prera-direction=bottomup" HAS_MISCHED_PRERA_DIRECTION)
 if(HAS_MISCHED_BOTTOMUP)
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1")
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1")
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1")
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-bottomup=1")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-bottomup=1")
 elseif(HAS_MISCHED_PRERA_DIRECTION)
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup")
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup")
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup")
-        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--slp-threshold=-32;-mllvm;--misched-prera-direction=bottomup")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_comp_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup")
+        set_source_files_properties(device_gemm_blockscale_wp_xdl_f8_f8_bf16/device_gemm_blockscale_wp_xdl_f8_f8_bf16_mk_nk_mn_128_128_128_mem_v1_kpadding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1;-mllvm;--misched-prera-direction=bottomup")
 endif()
 add_instance_library(device_gemm_blockscale_wp_instance ${GEMM_BLOCKSCALE_WP_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt
new file mode 100644
index 0000000000..c06e4f5953
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/CMakeLists.txt
@@ -0,0 +1,240 @@
+# ONLY XDL_KERNELS
+set(GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP)
+include(ShardInstantiation)
+
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   # large tensor
+   # NHWGC, GKYXC, NHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in
+  NUM_SHARDS 2
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+   # merged groups
+   # NHWGC, GKYXC, NHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
+   #mem
+   # NHWGC, GKYXC, NHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   # NHWGC, GKYXC, NHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   #comp
+   # NHWGC, GKYXC, NHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in
+  NUM_SHARDS 11
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in
+  NUM_SHARDS 1
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in
+  NUM_SHARDS 1
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in
+  NUM_SHARDS 1
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in
+  NUM_SHARDS 5
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in
+  NUM_SHARDS 12
+  SRC_LIST GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+
+add_instance_library(device_grouped_conv2d_fwd_bias_bnorm_clamp_instance ${GROUPED_CONV2D_FWD_BIAS_BNORM_CLAMP})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in
new file mode 100644
index 0000000000..51a12c33bd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_2x_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                               NHWGK,
+                                                               ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                               NHWGK,
+                                                               ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<2,
+                                                               NHWGC,
+                                                               GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                               NHWGK,
+                                                               ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in
new file mode 100644
index 0000000000..22ee546ac8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
+                                                        NHWGC,
+                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
+                                                                                   NHWGC,
+                                                                                   GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                   NHWGK,
+                                                                                   ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
+                                                        NHWGC,
+                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in
new file mode 100644
index 0000000000..632fee85a8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instances& instances)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                  NHWGK,
+                                                                  ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                  NHWGK,
+                                                                  ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<2,
+                                                                  NHWGC,
+                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                  NHWGK,
+                                                                  ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in
new file mode 100644
index 0000000000..50bbf761f1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_2x_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2,
+                                                              NHWGC,
+                                                              GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                              NHWGK,
+                                                              ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2,
+                                                              NHWGC,
+                                                              GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                              NHWGK,
+                                                              ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<2,
+                                                              NHWGC,
+                                                              GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                              NHWGK,
+                                                              ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in
new file mode 100644
index 0000000000..89baaff411
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_comp_instances<2,
+                                                       NHWGC,
+                                                       GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                       NHWGK,
+                                                       ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_comp_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_comp_instances<2,
+                                                       NHWGC,
+                                                       GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                       NHWGK,
+                                                       ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in
new file mode 100644
index 0000000000..80a2655de6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instances& instances)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2,
+                                                                 NHWGC,
+                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                 NHWGK,
+                                                                 ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2,
+                                                                 NHWGC,
+                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                 NHWGK,
+                                                                 ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<2,
+                                                                 NHWGC,
+                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                 NHWGK,
+                                                                 ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in
new file mode 100644
index 0000000000..395885d03d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.in
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_comp_instances<2,
+                                                       NHWGC,
+                                                       GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                       NHWGK,
+                                                       ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_comp_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_comp_instances<2,
+                                                       NHWGC,
+                                                       GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                       NHWGK,
+                                                       ConvFwd1x1S1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in
new file mode 100644
index 0000000000..097254dc34
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_16x16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_16x16_instances<2,
+                                                         NHWGC,
+                                                         GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                         NHWGK,
+                                                         ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_16x16_instances<2,
+                                                         NHWGC,
+                                                         GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                         NHWGK,
+                                                         ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_16x16_instances<2,
+                                                         NHWGC,
+                                                         GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                         NHWGK,
+                                                         ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in
new file mode 100644
index 0000000000..7844440dd0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.in
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                                                              NHWGC,
+                                                                              GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                              NHWGK,
+                                                                              ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                                                              NHWGC,
+                                                                              GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                              NHWGK,
+                                                                              ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                                                              NHWGC,
+                                                                              GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                              NHWGK,
+                                                                              ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in
new file mode 100644
index 0000000000..9db1750e8e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instance.in
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_16x16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_16x16_instances<2,
+                                                        NHWGC,
+                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_16x16_instances<2,
+                                                                                   NHWGC,
+                                                                                   GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                   NHWGK,
+                                                                                   ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_16x16_instances<2,
+                                                        NHWGC,
+                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in
new file mode 100644
index 0000000000..341fdf6eb6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instance.in
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                             NHWGC,
+                                                                             GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                             NHWGK,
+                                                                             ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                             NHWGC,
+                                                                             GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                             NHWGK,
+                                                                             ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                             NHWGC,
+                                                                             GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                             NHWGK,
+                                                                             ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in
new file mode 100644
index 0000000000..bcb126392a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instance.in
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_16x16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_16x16_instances<2,
+                                                        NHWGC,
+                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_16x16_instances<2,
+                                                                                   NHWGC,
+                                                                                   GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                   NHWGK,
+                                                                                   ConvFwd1x1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_16x16_instances<2,
+                                                        NHWGC,
+                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                        NHWGK,
+                                                        ConvFwd1x1S1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in
new file mode 100644
index 0000000000..4e3a435e74
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instance.in
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_instances<2,
+                                                                             NHWGC,
+                                                                             GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                             NHWGK,
+                                                                             ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_instances<2,
+                                                                             NHWGC,
+                                                                             GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                             NHWGK,
+                                                                             ConvFwd1x1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_instances<2,
+                                                                             NHWGC,
+                                                                             GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                             NHWGK,
+                                                                             ConvFwd1x1S1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in
new file mode 100644
index 0000000000..0956d9dd71
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instance.in
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_bf16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in
new file mode 100644
index 0000000000..b836dd8374
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instance.in
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_large_tensor_f16_instances<2,
+                                                               NHWGC,
+                                                               GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                               NHWGK,
+                                                               ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in
new file mode 100644
index 0000000000..6b8cbf1704
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instance.in
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_large_tensor_nhwgc_gkyxc_nhwgk_f32_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_large_tensor_f32_instances<2,
+                                                               NHWGC,
+                                                               GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                               NHWGK,
+                                                               ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in
new file mode 100644
index 0000000000..a2c36ee52b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Interwave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Interwave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in
new file mode 100644
index 0000000000..1c12ae66a3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Intrawave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Intrawave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in
new file mode 100644
index 0000000000..4fde5e662c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in
new file mode 100644
index 0000000000..d75c7f70d5
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in
new file mode 100644
index 0000000000..d51b3d01e3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in
new file mode 100644
index 0000000000..47135a2dd7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<2,
+                                                                                 NHWGC,
+                                                                                 GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                                 NHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in
new file mode 100644
index 0000000000..3e08e9668f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instance.in
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2,
+                                                                        NHWGC,
+                                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                        NHWGK,
+                                                                        ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<2,
+                                                                        NHWGC,
+                                                                        GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                        NHWGK,
+                                                                        ConvFwd3x3,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+    else
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                     NHWGK,
+                                                                     ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<2,
+                                                                     NHWGC,
+                                                                     GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                     NHWGK,
+                                                                     ConvFwd3x3,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in
new file mode 100644
index 0000000000..ec76a8e1d1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instance.in
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f16_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2,
+                                                                       NHWGC,
+                                                                       GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                       NHWGK,
+                                                                       ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<2,
+                                                                       NHWGC,
+                                                                       GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                       NHWGK,
+                                                                       ConvFwd3x3,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+    else
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                    NHWGK,
+                                                                    ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                    NHWGK,
+                                                                    ConvFwd3x3,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in
new file mode 100644
index 0000000000..2bbac89bbe
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.in
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances_shard(device_grouped_conv2d_fwd_bias_bn_clamp_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<2,
+                                                                NHWGC,
+                                                                GKYXC, Tuple<NHWGK, NHWGK, NHWGK, NHWGK, NHWGK>,
+                                                                NHWGK,
+                                                                ConvFwd3x3,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt
new file mode 100644
index 0000000000..bda9149227
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/CMakeLists.txt
@@ -0,0 +1,240 @@
+# ONLY XDL_KERNELS
+set(GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP)
+include(ShardInstantiation)
+
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+   # large tensor
+   # NDHWGC, GKZYXC, NDHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances
+  TEMPLATE_FILE xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
+  NUM_SHARDS 2
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/large_tensor
+)
+   # merged groups
+   # NDHWGC, GKZYXC, NDHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances
+  TEMPLATE_FILE xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
+  NUM_SHARDS 3
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/merged_groups
+)
+   #mem
+   # NDHWGC, GKZYXC, NDHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   # NDHWGC, GKZYXC, NDHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in
+  NUM_SHARDS 20
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+   #comp
+   # NDHWGC, GKZYXC, NDHWGK
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
+  NUM_SHARDS 11
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
+  NUM_SHARDS 1
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in
+  NUM_SHARDS 4
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in
+  NUM_SHARDS 1
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in
+  NUM_SHARDS 1
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in
+  NUM_SHARDS 5
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+   
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in
+  NUM_SHARDS 12
+  SRC_LIST GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+
+add_instance_library(device_grouped_conv3d_fwd_bias_bnorm_clamp_instance ${GROUPED_conv3d_FWD_BIAS_BNORM_CLAMP})
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in
new file mode 100644
index 0000000000..f397f0a810
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_2x_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                               NDHWGK,
+                                                               ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                               NDHWGK,
+                                                               ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                               NDHWGK,
+                                                               ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
new file mode 100644
index 0000000000..d6aa4ea964
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                        NDHWGK,
+                                                        ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                                                   NDHWGC,
+                                                                                   GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                   NDHWGK,
+                                                                                   ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                        NDHWGK,
+                                                        ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in
new file mode 100644
index 0000000000..7c993f8b94
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_part2_instances& instances)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                  NDHWGK,
+                                                                  ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                  NDHWGK,
+                                                                  ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                  NDHWGK,
+                                                                  ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in
new file mode 100644
index 0000000000..fb41ec60f8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_2x_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                              NDHWGK,
+                                                              ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                              NDHWGK,
+                                                              ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                              NDHWGK,
+                                                              ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
new file mode 100644
index 0000000000..e1d581e4fd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                       NDHWGK,
+                                                       ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                       NDHWGK,
+                                                       ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in
new file mode 100644
index 0000000000..99b48d51a0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instance.in
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_part2_instances& instances)
+{
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                 NDHWGK,
+                                                                 ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                 NDHWGK,
+                                                                 ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                 NDHWGK,
+                                                                 ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in
new file mode 100644
index 0000000000..b172975635
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/comp/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.in
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_comp_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                       NDHWGK,
+                                                       ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_comp_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_comp_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                       NDHWGK,
+                                                       ConvFwd1x1S1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in
new file mode 100644
index 0000000000..8ec8d9248f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_16x16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_16x16_instances<3,
+                                                         NDHWGC,
+                                                         GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                         NDHWGK,
+                                                         ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_16x16_instances<3,
+                                                         NDHWGC,
+                                                         GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                         NDHWGK,
+                                                         ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_bf16_16x16_instances<3,
+                                                         NDHWGC,
+                                                         GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                         NDHWGK,
+                                                         ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
new file mode 100644
index 0000000000..fb5c4159fd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NDHWGC,
+                                                                              GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                              NDHWGK,
+                                                                              ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NDHWGC,
+                                                                              GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                              NDHWGK,
+                                                                              ConvFwd1x1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NDHWGC,
+                                                                              GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                              NDHWGK,
+                                                                              ConvFwd1x1S1P0,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in
new file mode 100644
index 0000000000..a00fbf5342
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.in
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_16x16_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                        NDHWGK,
+                                                        ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_16x16_instances<3,
+                                                                                   NDHWGC,
+                                                                                   GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                   NDHWGK,
+                                                                                   ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f16_16x16_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                        NDHWGK,
+                                                        ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
new file mode 100644
index 0000000000..222ec0c2e0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NDHWGC,
+                                                                             GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                             NDHWGK,
+                                                                             ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NDHWGC,
+                                                                             GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                             NDHWGK,
+                                                                             ConvFwd1x1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NDHWGC,
+                                                                             GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                             NDHWGK,
+                                                                             ConvFwd1x1S1P0,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in
new file mode 100644
index 0000000000..8fbedb7793
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.in
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_16x16_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                        NDHWGK,
+                                                        ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_16x16_instances<3,
+                                                                                   NDHWGC,
+                                                                                   GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                   NDHWGK,
+                                                                                   ConvFwd1x1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_f32_16x16_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                        NDHWGK,
+                                                        ConvFwd1x1S1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
new file mode 100644
index 0000000000..c538d50fc9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_instances<3,
+                                                                             NDHWGC,
+                                                                             GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                             NDHWGK,
+                                                                             ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_instances<3,
+                                                                             NDHWGC,
+                                                                             GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                             NDHWGK,
+                                                                             ConvFwd1x1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_instances<3,
+                                                                             NDHWGC,
+                                                                             GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                             NDHWGK,
+                                                                             ConvFwd1x1S1P0,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
new file mode 100644
index 0000000000..be76a48480
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_large_tensor_bf16_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
new file mode 100644
index 0000000000..dcfdb984c2
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_large_tensor_f16_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                               NDHWGK,
+                                                               ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
new file mode 100644
index 0000000000..ed1988cdf4
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/large_tensor/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_large_tensor_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_large_tensor_f32_instances<3,
+                                                               NDHWGC,
+                                                               GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                               NDHWGK,
+                                                               ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in
new file mode 100644
index 0000000000..83af7e09ce
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Interwave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Interwave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in
new file mode 100644
index 0000000000..ce83cb566a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Intrawave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Intrawave,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in
new file mode 100644
index 0000000000..051aaf7cf3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in
new file mode 100644
index 0000000000..6fa3709cc6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in
new file mode 100644
index 0000000000..2ba3e4ec93
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in
new file mode 100644
index 0000000000..c4d33236af
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/mem/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instances& instances)
+{
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(instances, ck::util::filter_tuple_by_modulo_t<
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NDHWGC,
+                                                                                 GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                                 NDHWGK,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
new file mode 100644
index 0000000000..6a902ed72d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instance.in
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Tuple<BF16, BF16, BF16, BF16, BF16>,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_bf16_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<3,
+                                                                        NDHWGC,
+                                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                        NDHWGK,
+                                                                        ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances_2x<3,
+                                                                        NDHWGC,
+                                                                        GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                        NDHWGK,
+                                                                        ConvFwd3x3,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+    else
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                     NDHWGK,
+                                                                     ConvFwdDefault,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_bf16_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                     NDHWGK,
+                                                                     ConvFwd3x3,Tuple<BF16, BF16, BF16, BF16, BF16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
new file mode 100644
index 0000000000..b8125423bc
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instance.in
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Tuple<F16, F16, F16, F16, F16>,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f16_instances& instances)
+{
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<3,
+                                                                       NDHWGC,
+                                                                       GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                       NDHWGK,
+                                                                       ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances_2x<3,
+                                                                       NDHWGC,
+                                                                       GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                       NDHWGK,
+                                                                       ConvFwd3x3,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+    else
+    {
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                    NDHWGK,
+                                                                    ConvFwdDefault,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+        add_device_operation_instances(
+            instances, ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_merged_groups_f16_instances<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                    NDHWGK,
+                                                                    ConvFwd3x3,Tuple<F16, F16, F16, F16, F16>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
new file mode 100644
index 0000000000..f292d95cda
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_bias_bnorm_clamp/xdl/merged_groups/device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instance.in
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances = std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                F32,
+                                                                F32,
+                                                                Tuple<F32, F32, F32, F32, F32>,
+                                                                F32,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                BiasNormalizeInInferClamp>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances_shard(device_grouped_conv3d_fwd_bias_bn_clamp_xdl_merged_groups_ndhwgc_gkzyxc_ndhwgk_f32_instances& instances)
+{
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                ConvFwdDefault,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances, ck::util::filter_tuple_by_modulo_t<
+        device_grouped_conv_fwd_xdl_merged_groups_f32_instances<3,
+                                                                NDHWGC,
+                                                                GKZYXC, Tuple<NDHWGK, NDHWGK, NDHWGK, NDHWGK, NDHWGK>,
+                                                                NDHWGK,
+                                                                ConvFwd3x3,Tuple<F32, F32, F32, F32, F32>, BiasNormalizeInInferClamp>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
new file mode 100644
index 0000000000..cd6c141219
--- /dev/null
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp
@@ -0,0 +1,427 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp"
+
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp"
+
+namespace ck {
+namespace profiler {
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+using Clamp        = ck::tensor_operation::element_wise::Clamp;
+using Add          = ck::tensor_operation::element_wise::Add;
+
+// NOTE: Usage of NHWGK layout for GK bias is a workaround. This test is to
+// just keep such implementation valid.
+// TODO: Add possiblity to pass GK layout and GK lengths for bias and reuse
+// the same instances.
+
+template <ck::index_t NDimSpatial>
+auto get_elementwise_desc(ck::index_t G, ck::index_t K)
+{
+    if constexpr(NDimSpatial == 1)
+    {
+        return HostTensorDescriptor({G, 1, K, 1}, {K, 0, 1, 0});
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        return HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0});
+    }
+    else
+    {
+        return HostTensorDescriptor({G, 1, K, 1, 1, 1}, {K, 0, 1, 0, 0, 0});
+    }
+}
+
+template <ck::index_t NDimSpatial, typename OutDataType>
+void ref_bnorm_clamp_infer(Tensor<OutDataType>& out,
+                           Tensor<OutDataType>& in,
+                           Tensor<OutDataType>& mean,
+                           Tensor<OutDataType>& variance,
+                           Tensor<OutDataType>& scale,
+                           Tensor<OutDataType>& shift,
+                           const float floor,
+                           const float ceil,
+                           const float epsilon)
+{
+
+    auto func = [&](auto... idxs) {
+        const float x = type_convert<float>(in(idxs...));
+
+        const float invVariance =
+            type_convert<float>(1.0f) / std::sqrt(epsilon + type_convert<float>(variance(idxs...)));
+
+        const float norm_x = (x - type_convert<float>(mean(idxs...))) * invVariance;
+
+        float y =
+            type_convert<float>(scale(idxs...)) * norm_x + type_convert<float>(shift(idxs...));
+
+        Clamp{floor, ceil}(y, y);
+
+        out(idxs...) = type_convert<OutDataType>(y);
+    };
+    if constexpr(NDimSpatial == 1)
+    {
+        make_ParallelTensorFunctor(func,
+                                   out.GetLengths()[0],
+                                   out.GetLengths()[1],
+                                   out.GetLengths()[2],
+                                   out.GetLengths()[3])(std::thread::hardware_concurrency());
+    }
+    else if constexpr(NDimSpatial == 2)
+    {
+        make_ParallelTensorFunctor(func,
+                                   out.GetLengths()[0],
+                                   out.GetLengths()[1],
+                                   out.GetLengths()[2],
+                                   out.GetLengths()[3],
+                                   out.GetLengths()[4])(std::thread::hardware_concurrency());
+    }
+    else
+    {
+        make_ParallelTensorFunctor(func,
+                                   out.GetLengths()[0],
+                                   out.GetLengths()[1],
+                                   out.GetLengths()[2],
+                                   out.GetLengths()[3],
+                                   out.GetLengths()[4],
+                                   out.GetLengths()[5])(std::thread::hardware_concurrency());
+    }
+}
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AComputeType = InDataType,
+          typename BComputeType = AComputeType,
+          typename IndexType    = ck::index_t,
+          bool ElementwiseGK    = false>
+bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
+                                              int init_method,
+                                              bool do_log,
+                                              bool time_kernel,
+                                              const ck::utils::conv::ConvParam& conv_param)
+{
+    const float floor   = 0.f;
+    const float ceil    = 2048.f;
+    const float epsilon = 1e-4;
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{floor, ceil, epsilon};
+
+    const auto in_g_n_c_wis_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+    const auto wei_g_k_c_xs_desc =
+        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+    const auto out_g_n_k_wos_desc =
+        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    const index_t G = conv_param.G_;
+    const index_t K = conv_param.K_;
+
+    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<IndexType, NDimSpatial + 3> d_g_n_k_wos_strides{};
+    std::array<IndexType, NDimSpatial> conv_filter_strides{};
+    std::array<IndexType, NDimSpatial> conv_filter_dilations{};
+    std::array<IndexType, NDimSpatial> input_left_pads{};
+    std::array<IndexType, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    Tensor<InDataType> input(in_g_n_c_wis_desc);
+    Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
+    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
+    const auto elementwise_desc =
+        ElementwiseGK ? get_elementwise_desc<NDimSpatial>(G, K) : out_g_n_k_wos_desc;
+
+    Tensor<OutDataType> bias(elementwise_desc);
+    Tensor<OutDataType> mean(elementwise_desc);
+    Tensor<OutDataType> variance(elementwise_desc);
+    Tensor<OutDataType> scale(elementwise_desc);
+    Tensor<OutDataType> shift(elementwise_desc);
+
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << host_output.mDesc << std::endl;
+
+    std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "mean: " << mean.mDesc << std::endl;
+    std::cout << "variance: " << variance.mDesc << std::endl;
+    std::cout << "scale: " << scale.mDesc << std::endl;
+    std::cout << "shift: " << shift.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+
+        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        mean.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        variance.GenerateTensorValue(GeneratorTensor_2<OutDataType>{0, 5});
+        scale.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        shift.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        break;
+    default:
+        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+
+        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        mean.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        variance.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0, 0.5});
+        scale.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        shift.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
+
+    const std::size_t elementwise_dev_buf_size =
+        ElementwiseGK ? sizeof(OutDataType) * G * K
+                      : sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize();
+    DeviceMem bias_device_buf(elementwise_dev_buf_size);
+    DeviceMem mean_device_buf(elementwise_dev_buf_size);
+    DeviceMem variance_device_buf(elementwise_dev_buf_size);
+    DeviceMem scale_device_buf(elementwise_dev_buf_size);
+    DeviceMem shift_device_buf(elementwise_dev_buf_size);
+
+    in_device_buf.ToDevice(input.mData.data());
+    wei_device_buf.ToDevice(weight.mData.data());
+
+    bias_device_buf.ToDevice(bias.mData.data());
+    mean_device_buf.ToDevice(mean.mData.data());
+    variance_device_buf.ToDevice(variance.mData.data());
+    scale_device_buf.ToDevice(scale.mData.data());
+    shift_device_buf.ToDevice(shift.mData.data());
+
+    if constexpr(ElementwiseGK)
+    {
+        constexpr ck::index_t spatial_offset = 3;
+        d_g_n_k_wos_strides[1]               = 0;
+        for(int i = 0; i < NDimSpatial; i++)
+        {
+            d_g_n_k_wos_strides[i + spatial_offset] = 0;
+        }
+    }
+
+    // run reference op
+    if(do_verification)
+    {
+        // Run Conv and Bnorm seperatly
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     Add,
+                                                                     0,
+                                                                     0,
+                                                                     1>{};
+
+        std::array<Tensor<OutDataType>, 1> d_tensors = {bias};
+        auto ref_conv_invoker                        = ref_conv.MakeInvoker();
+        auto ref_conv_argument                       = ref_conv.MakeArgument(input,
+                                                       weight,
+                                                       host_output,
+                                                       conv_param.conv_filter_strides_,
+                                                       conv_param.conv_filter_dilations_,
+                                                       conv_param.input_left_pads_,
+                                                       conv_param.input_right_pads_,
+                                                       in_element_op,
+                                                       wei_element_op,
+                                                       Add{},
+                                                                             {},
+                                                                             {},
+                                                       d_tensors);
+
+        // init host output to zero
+        host_output.SetZero();
+        ref_conv_invoker.Run(ref_conv_argument);
+        ref_bnorm_clamp_infer<NDimSpatial>(
+            host_output, host_output, mean, variance, scale, shift, floor, ceil, epsilon);
+    }
+
+    std::string best_op_name;
+    float best_avg_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device op instances
+    bool pass = true;
+
+    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
+        // workspace_sz will be equal to 0 for other layout than NGCHW
+        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+        DeviceMem workspace_dev(workspace_sz);
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init output to zero before profiling next kernel
+            out_device_buf.SetZero();
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t flop      = conv_param.GetFlops();
+            std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+
+            float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
+
+            float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                out_device_buf.FromDevice(device_output.mData.data());
+
+                pass = pass & ck::utils::check_err(device_output, host_output);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "host_output  : ", host_output.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    };
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<OutLayout, OutLayout, OutLayout, OutLayout, OutLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<OutDataType, OutDataType, OutDataType, OutDataType, OutDataType>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        AComputeType,
+        BComputeType>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
+
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                        wei_device_buf.GetDeviceBuffer(),
+                                                        {bias_device_buf.GetDeviceBuffer(),
+                                                         mean_device_buf.GetDeviceBuffer(),
+                                                         variance_device_buf.GetDeviceBuffer(),
+                                                         scale_device_buf.GetDeviceBuffer(),
+                                                         shift_device_buf.GetDeviceBuffer()},
+                                                        out_device_buf.GetDeviceBuffer(),
+                                                        a_g_n_c_wis_lengths,
+                                                        a_g_n_c_wis_strides,
+                                                        b_g_k_c_xs_lengths,
+                                                        b_g_k_c_xs_strides,
+                                                        {e_g_n_k_wos_lengths,
+                                                         e_g_n_k_wos_lengths,
+                                                         e_g_n_k_wos_lengths,
+                                                         e_g_n_k_wos_lengths,
+                                                         e_g_n_k_wos_lengths},
+                                                        {d_g_n_k_wos_strides,
+                                                         d_g_n_k_wos_strides,
+                                                         d_g_n_k_wos_strides,
+                                                         d_g_n_k_wos_strides,
+                                                         d_g_n_k_wos_strides},
+                                                        e_g_n_k_wos_lengths,
+                                                        e_g_n_k_wos_strides,
+                                                        conv_filter_strides,
+                                                        conv_filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads,
+                                                        in_element_op,
+                                                        wei_element_op,
+                                                        out_element_op);
+
+        run_impl(op_ptr, argument_ptr);
+    }
+
+    std::cout << "Best configuration parameters:" << "\nname: " << best_op_name
+              << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
+              << "\nGB/s: " << best_gb_per_sec << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh
index a770970fef..ea2834ae62 100755
--- a/script/clang-format-overwrite.sh
+++ b/script/clang-format-overwrite.sh
@@ -1,2 +1,7 @@
+#!/bin/bash
+set -euo pipefail
+IFS=$'\n\t'
+
+
 find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
 git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|.hpp|.inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index c45bb4330d..25a1590808 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+set -euo pipefail
+IFS=$'\n\t'
+
 rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh
index 311ea91822..5263de92c8 100755
--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+set -euo pipefail
+IFS=$'\n\t'
+
 rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt
index 42605f2513..374e5b4990 100644
--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
@@ -21,3 +21,5 @@ add_subdirectory(add_rmsnorm2d_rdquant)
 # add_subdirectory(layernorm2d)
 # add_subdirectory(rmsnorm2d)
 add_subdirectory(gemm_block_scale)
+add_subdirectory(utility)
+add_subdirectory(reduce)
diff --git a/test/ck_tile/batched_transpose/test_batched_transpose.cpp b/test/ck_tile/batched_transpose/test_batched_transpose.cpp
index cce00e27cb..77d5825eed 100644
--- a/test/ck_tile/batched_transpose/test_batched_transpose.cpp
+++ b/test/ck_tile/batched_transpose/test_batched_transpose.cpp
@@ -95,10 +95,12 @@ class TestCkTileBatchedTranspose //              N    C    H    W    layout_in==
         ck_tile::HostTensor<DataType> y_ref(Y_dim, Y_stride);
 
         ck_tile::FillUniformDistribution<DataType>{-.5f, .5f}(x_host);
+        ck_tile::FillConstant<DataType>{-37}(y_host);
 
         ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes());
         ck_tile::DeviceMem y_dev(y_host.get_element_space_size_in_bytes());
         x_dev.ToDevice(x_host.data());
+        y_dev.ToDevice(y_host.data());
 
         using Kernel = typename Config::Kernel;
 
@@ -131,8 +133,8 @@ class TestCkTileBatchedTranspose //              N    C    H    W    layout_in==
                                                                  height,
                                                                  width,
                                                                  height * width,
-                                                                 Config::BlockTile::at(1),
-                                                                 Config::BlockTile::at(0)};
+                                                                 Config::BlockTile::at(0),
+                                                                 Config::BlockTile::at(1)};
         auto kargs           = Kernel::MakeKargs(host_args);
 
         auto sc                   = ck_tile::stream_config{};
@@ -140,15 +142,24 @@ class TestCkTileBatchedTranspose //              N    C    H    W    layout_in==
         constexpr dim3 block_size = Kernel::BlockSize();
         ck_tile::launch_kernel(
             sc, ck_tile::make_kernel<block_size.x, 1>(Kernel{}, grid_size, block_size, 0, kargs));
+
         y_dev.FromDevice(y_host.data());
         ck_tile::reference_batched_transpose<DataType>(x_host, y_ref, layout_in, layout_out);
 
         std::ostringstream message;
         message << "N=" << N << " C=" << C << " H=" << H << " W=" << W << " layout_in=" << layout_in
-                << " layout_out=" << layout_out << " device_name=" << device_name;
+                << " layout_out=" << layout_out << " grid_size={" << grid_size.x << ", "
+                << grid_size.y << ", " << grid_size.z << "} block_size=" << block_size.x
+                << " device_name=" << device_name;
 
+        // NB: order of output and reference matters
         bool pass = ck_tile::check_err(
-            y_ref, y_host, message.str(), /* rtol */ 0, /* atol */ 0, /* allow inf */ false);
+            /* out */ y_host,
+            /* ref */ y_ref,
+            message.str(),
+            /* rtol */ 0,
+            /* atol */ 0,
+            /* allow inf */ false);
 
         EXPECT_TRUE(pass);
     }
@@ -160,14 +171,16 @@ static const auto kTestingValues = ::testing::Values(
 //             N  C   H  W   layout_in==NCHW    
     std::tuple{1, 32, 1, 32, true},
     std::tuple{1, 64, 1, 64, true},
+    std::tuple{1, 32, 1, 64, true},
+    std::tuple{1, 64, 1, 32, true},
     std::tuple{2, 12, 1, 32, false},
     std::tuple{3, 1334, 1, 37, false},
     std::tuple{4, 27, 1, 32, true},
     std::tuple{5, 1234, 1, 12, true},
     std::tuple{1, 1, 1, 1, true},
     std::tuple{1, 1, 1, 1, false},
-    std::tuple{128, 1024, 64, 64, true},
-    std::tuple{128, 1024, 64, 64, false},
+    std::tuple{17, 1024, 64, 64, true},
+    std::tuple{17, 1024, 64, 64, false},
     std::tuple{16, 64, 32, 128, true},
     std::tuple{16, 64, 128, 32, false},
     std::tuple{1, 2048, 1, 1, true},
@@ -239,6 +252,60 @@ class CaseHalfPadMultiWarpLoadTranspose
 {
 };
 
+class CaseHalfPadMultiWarp128MNLoadTranspose
+    : public TestCkTileBatchedTranspose<PipelineConfig<ck_tile::half_t,
+                                                       PipelineTag::LDSLoadTranspose,
+                                                       128,
+                                                       128,
+                                                       2,
+                                                       2,
+                                                       false,
+                                                       false>>
+{
+};
+
+class CaseHalfPadMultiWarp128MN
+    : public TestCkTileBatchedTranspose<
+          PipelineConfig<ck_tile::half_t, PipelineTag::Universal, 128, 128, 2, 2, false, false>>
+{
+};
+
+class CaseHalfPadRectTile1
+    : public TestCkTileBatchedTranspose<
+          PipelineConfig<ck_tile::half_t, PipelineTag::Universal, 32, 64, 1, 1, false, false>>
+{
+};
+
+class CaseHalfPadRectTile2
+    : public TestCkTileBatchedTranspose<
+          PipelineConfig<ck_tile::half_t, PipelineTag::Universal, 64, 32, 1, 1, false, false>>
+{
+};
+
+class CaseHalfPadRectTile1LoadTranspose
+    : public TestCkTileBatchedTranspose<PipelineConfig<ck_tile::half_t,
+                                                       PipelineTag::LDSLoadTranspose,
+                                                       32,
+                                                       64,
+                                                       1,
+                                                       1,
+                                                       false,
+                                                       false>>
+{
+};
+
+class CaseHalfPadRectTile2LoadTranspose
+    : public TestCkTileBatchedTranspose<PipelineConfig<ck_tile::half_t,
+                                                       PipelineTag::LDSLoadTranspose,
+                                                       64,
+                                                       32,
+                                                       1,
+                                                       1,
+                                                       false,
+                                                       false>>
+{
+};
+
 TEST_P(CaseHalf, TestCorrectness) { this->Run(GetParam()); }
 TEST_P(CaseByte, TestCorrectness) { this->Run(GetParam()); }
 TEST_P(CaseWord, TestCorrectness) { this->Run(GetParam()); }
@@ -248,6 +315,12 @@ TEST_P(CaseHalfPad, TestCorrectness) { this->Run(GetParam()); }
 TEST_P(CaseHalfPadLoadTranspose, TestCorrectness) { this->Run(GetParam()); }
 TEST_P(CaseHalfPadMultiWarp, TestCorrectness) { this->Run(GetParam()); }
 TEST_P(CaseHalfPadMultiWarpLoadTranspose, TestCorrectness) { this->Run(GetParam()); }
+TEST_P(CaseHalfPadMultiWarp128MN, TestCorrectness) { this->Run(GetParam()); }
+TEST_P(CaseHalfPadMultiWarp128MNLoadTranspose, TestCorrectness) { this->Run(GetParam()); }
+TEST_P(CaseHalfPadRectTile1, TestCorrectness) { this->Run(GetParam()); }
+TEST_P(CaseHalfPadRectTile1LoadTranspose, TestCorrectness) { this->Run(GetParam()); }
+TEST_P(CaseHalfPadRectTile2, TestCorrectness) { this->Run(GetParam()); }
+TEST_P(CaseHalfPadRectTile2LoadTranspose, TestCorrectness) { this->Run(GetParam()); }
 
 // clang-format off
 INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalf, kTestingValues);
@@ -259,4 +332,11 @@ INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPad, kTestingV
 INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadLoadTranspose, kTestingValues);
 INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp, kTestingValues);
 INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarpLoadTranspose, kTestingValues);
+INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MN, kTestingValues);
+INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadMultiWarp128MNLoadTranspose, kTestingValues);
+INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1, kTestingValues);
+INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile1LoadTranspose, kTestingValues);
+INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2, kTestingValues);
+INSTANTIATE_TEST_SUITE_P(TestCkTileBatchedTransposeSuite, CaseHalfPadRectTile2LoadTranspose, kTestingValues);
+
 // clang-format on
diff --git a/test/ck_tile/data_type/CMakeLists.txt b/test/ck_tile/data_type/CMakeLists.txt
index a9ce48d1de..384fd3c1c4 100644
--- a/test/ck_tile/data_type/CMakeLists.txt
+++ b/test/ck_tile/data_type/CMakeLists.txt
@@ -3,11 +3,13 @@ if(GPU_TARGETS MATCHES "gfx9")
 endif()
 if(GPU_TARGETS MATCHES "gfx95")
     add_gtest_executable(test_ck_tile_pk_fp4 test_pk_fp4.cpp)
+    add_gtest_executable(test_ck_tile_mx_scale test_mx_scale.cpp)
 endif()
 
 if(CK_USE_OCP_FP8 OR CK_USE_FNUZ_FP8)
     add_gtest_executable(test_ck_tile_fp8 test_fp8.cpp)
     target_compile_options(test_ck_tile_fp8 PRIVATE -Wno-float-equal)
+    target_compile_definitions(test_ck_tile_fp8 PUBLIC GTEST_HAS_RTTI=0)
     # conditionally specify the use of OCP_FP8
     if(CK_USE_OCP_FP8)
         target_compile_options(test_ck_tile_fp8 PRIVATE -DCK_TILE_USE_OCP_FP8)
diff --git a/test/ck_tile/data_type/test_mx_scale.cpp b/test/ck_tile/data_type/test_mx_scale.cpp
new file mode 100644
index 0000000000..7a024d238f
--- /dev/null
+++ b/test/ck_tile/data_type/test_mx_scale.cpp
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include <hip/hip_runtime.h>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+using ck_tile::bf16_t;
+using ck_tile::bf16x2_t;
+using ck_tile::fp16_t;
+using ck_tile::fp16x2_t;
+using ck_tile::fp32_t;
+using ck_tile::fp32x2_t;
+using ck_tile::number;
+using ck_tile::pk_fp4_t;
+
+template <typename SRC, typename DST, bool is_device>
+CK_TILE_HOST void test_convert();
+
+using ck_tile::e8m0_raw_t;
+using ck_tile::e8m0_t;
+
+TEST(OCP_Scale, NumericLimits)
+{
+    EXPECT_EQ(ck_tile::numeric<e8m0_t>::has_inf(), false);
+    EXPECT_EQ(ck_tile::numeric<e8m0_t>::zero(), ck_tile::numeric<e8m0_t>::signaling_NaN());
+    EXPECT_EQ(ck_tile::numeric<e8m0_t>::min(), e8m0_t{e8m0_raw_t{0b00000000}});
+    EXPECT_EQ(ck_tile::numeric<e8m0_t>::max(), e8m0_t{e8m0_raw_t{0b11111110}});
+}
+TEST(OCP_Scale, NumericBasic)
+{
+    auto scale_1 = e8m0_t{1.0f};
+    auto scale_2 = e8m0_t{e8m0_raw_t{ck_tile::numeric_traits<e8m0_t>::bias}}; // 2^0
+    EXPECT_EQ(scale_1, scale_2);
+
+    auto scale_3 = e8m0_t{8.0f};
+    auto scale_4 = e8m0_t{e8m0_raw_t{3 + ck_tile::numeric_traits<e8m0_t>::bias}}; // 2^3
+    EXPECT_EQ(scale_3, scale_4);
+}
+
+TEST(OCP_Scale, ScaledConvertDevice)
+{
+    constexpr bool is_device = true;
+    test_convert<fp32_t, fp32_t, is_device>(); // fp32 -> fp4 -> fp32
+    test_convert<fp16_t, fp16_t, is_device>();
+    test_convert<bf16_t, bf16_t, is_device>();
+    test_convert<fp32_t, fp16_t, is_device>();
+    test_convert<fp32_t, bf16_t, is_device>();
+    test_convert<fp16_t, fp32_t, is_device>();
+    test_convert<bf16_t, fp32_t, is_device>();
+}
+TEST(OCP_Scale, ScaledConvertHost)
+{
+    constexpr bool is_device = false;
+    test_convert<fp32_t, fp32_t, is_device>(); // fp32 -> fp4 -> fp32
+    test_convert<fp16_t, fp16_t, is_device>();
+    test_convert<bf16_t, bf16_t, is_device>();
+    test_convert<fp32_t, fp16_t, is_device>();
+    test_convert<fp32_t, bf16_t, is_device>();
+    test_convert<fp16_t, fp32_t, is_device>();
+    test_convert<bf16_t, fp32_t, is_device>();
+}
+TEST(OCP_Scale, tensorInit)
+{
+    using scale_t = e8m0_t;
+    ck_tile::HostTensor<scale_t> scales({10, 10});
+    ck_tile::FillUniformDistribution<scale_t>{1.f, 1.f}(scales);
+    scales.SetZero();
+}
+
+#define toPF4(x, y) ck_tile::scaled_type_convert<pk_fp4_t>(x, y)
+#define toDST(x, y) ck_tile::scaled_type_convert<DST>(x, y)
+#define toDSTx2(x, y) ck_tile::scaled_type_convert<DSTx2_t>(x, y)
+
+#define toF32(x) ck_tile::type_convert<float>(x)
+#define toPF4_(x) ck_tile::type_convert<pk_fp4_t>(x)
+#define toSRC(x) ck_tile::type_convert<SRC>(x)
+#define toDST_(x) ck_tile::type_convert<DST>(x)
+
+template <typename Kernel, typename... Args>
+__global__ void MyKernel(Args... args)
+{
+    Kernel{}(args...);
+}
+template <typename SRC, typename DST, int N>
+struct SrcPkfp4Dst
+{
+    CK_TILE_HOST_DEVICE void
+    operator()(const SRC* src, DST* dst, e8m0_t scale1, e8m0_t scale2) const
+    {
+
+        using SRCx2_t = ck_tile::ext_vector_t<SRC, 2>;
+        using DSTx2_t = ck_tile::ext_vector_t<DST, 2>;
+
+        ck_tile::static_for<0, N, 2>{}([&](auto i) {
+            const auto input2 = SRCx2_t{src[i], src[i + 1]};
+
+            if(i % 4 == 0)
+            {
+                // ex: fp32_t -> fp4 -> bf16_t
+                dst[i] = toDST(toPF4(src[i], scale1), scale2);
+                // ex: fp32x2_t -> pk_fp4 -> unpack<0> -> bf16_t
+                dst[i + 1] = toDST(toPF4_(toPF4(input2, scale1).unpack(number<1>{})), scale2);
+            }
+            else
+            {
+                // ex: fp32x2_t -> pk_fp4_t -> bf16x2_t
+                reinterpret_cast<DSTx2_t*>(dst)[i >> 1] = toDSTx2(toPF4(input2, scale1), scale2);
+            }
+        });
+    }
+};
+
+template <typename SRC, typename DST, bool is_device>
+CK_TILE_HOST void test_convert()
+{
+    const auto test_data = std::array{4.f, 6.f, 8.f, 10.f};
+    const auto ref_data  = std::array{8.f, 16.f, 16.f, 16.f};
+    const auto scale1    = e8m0_t{8.0f};
+    const auto scale2    = e8m0_t{16.0f};
+
+    static_assert(test_data.size() == ref_data.size());
+    static_assert(test_data.size() % 2 == 0);
+
+    constexpr int N = test_data.size();
+    std::array<SRC, N> in;
+    std::array<DST, N> ref, out;
+
+    // prepare input and ground truth in host
+    for(int i = 0; i < N; ++i)
+    {
+        in[i]  = toSRC(test_data[i]);
+        ref[i] = toDST_(ref_data[i]);
+        EXPECT_EQ(test_data[i], toF32(in[i]));
+        EXPECT_EQ(ref_data[i], toF32(ref[i]));
+    }
+
+    using job = SrcPkfp4Dst<SRC, DST, N>;
+
+    if constexpr(is_device)
+    {
+        auto in_d  = std::make_unique<ck_tile::DeviceMem>(in.size() * sizeof(SRC));
+        auto out_d = std::make_unique<ck_tile::DeviceMem>(out.size() * sizeof(DST));
+        in_d->ToDevice(in.data());
+
+        MyKernel<job><<<1, 1>>>(reinterpret_cast<const SRC*>(in_d->GetDeviceBuffer()),
+                                reinterpret_cast<DST*>(out_d->GetDeviceBuffer()),
+                                scale1,
+                                scale2);
+
+        out_d->FromDevice(out.data());
+    }
+    else
+    {
+        job{}(in.data(), out.data(), scale1, scale2);
+    }
+
+    for(int i = 0; i < N; ++i)
+        EXPECT_EQ(ref[i], out[i]) << "i:" << i;
+}
diff --git a/test/ck_tile/reduce/CMakeLists.txt b/test/ck_tile/reduce/CMakeLists.txt
new file mode 100644
index 0000000000..052669e20a
--- /dev/null
+++ b/test/ck_tile/reduce/CMakeLists.txt
@@ -0,0 +1,7 @@
+if(GPU_TARGETS MATCHES "gfx9")
+    add_gtest_executable(test_ck_tile_reduce2d test_reduce2d.cpp)
+    if(result EQUAL 0)
+        target_link_libraries(test_ck_tile_reduce2d PRIVATE utility)
+    endif()
+endif()
+
diff --git a/test/ck_tile/reduce/test_reduce2d.cpp b/test/ck_tile/reduce/test_reduce2d.cpp
new file mode 100644
index 0000000000..4ce0b56ef3
--- /dev/null
+++ b/test/ck_tile/reduce/test_reduce2d.cpp
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <gtest/gtest.h>
+#include <vector>
+#include <cmath>
+#include <tuple>
+#include <iostream>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+
+template <typename Tuple>
+class TestCkTileReduce : public ::testing::Test
+{
+    protected:
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using ComputeDataType = std::tuple_element_t<1, Tuple>;
+    using YDataType       = std::tuple_element_t<2, Tuple>;
+    using ReduceOpType    = std::tuple_element_t<3, Tuple>;
+    using BlockWarps_     = std::tuple_element_t<4, Tuple>;
+    using BlockTile_      = std::tuple_element_t<5, Tuple>;
+    using WarpTile_       = std::tuple_element_t<6, Tuple>;
+    using ThreadTile_     = std::tuple_element_t<7, Tuple>;
+
+    using TestReduce2dShape =
+        ck_tile::Reduce2dShape<BlockWarps_, BlockTile_, WarpTile_, ThreadTile_>;
+
+    template <std::size_t InputDim, typename KeptDimSeq, typename ReduceDimSeq>
+    void RunGenericTest(const std::vector<ck_tile::index_t>& input_shape,
+                        const std::vector<ck_tile::index_t>& input_strides,
+                        const std::vector<ck_tile::index_t>& output_shape,
+                        const std::vector<ck_tile::index_t>& output_strides,
+                        ck_tile::index_t kept_dim_len_prod,
+                        ck_tile::index_t total_reduce_elements,
+                        KeptDimSeq kept_dims,
+                        ReduceDimSeq reduce_dims)
+    {
+        ck_tile::HostTensor<XDataType> h_x(input_shape, input_strides);
+        ck_tile::HostTensor<YDataType> h_y(output_shape, output_strides);
+        ck_tile::HostTensor<YDataType> h_y_ref(output_shape, output_strides);
+
+        ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(h_x);
+        h_y.SetZero();
+        h_y_ref.SetZero();
+
+        ck_tile::DeviceMem d_x_mem(h_x.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem d_y_mem(h_y.get_element_space_size_in_bytes());
+
+        d_x_mem.ToDevice(h_x.data());
+        d_y_mem.ToDevice(h_y.data()); // Initialize device output buffer
+
+        // Problem and kernel setup
+        using Problem = ck_tile::
+            Reduce2dProblem<XDataType, ComputeDataType, YDataType, TestReduce2dShape, ReduceOpType>;
+
+        using Kernel = ck_tile::Reduce<Problem>;
+
+        // Launch configuration
+        constexpr ck_tile::index_t kBlockSize  = 256;
+        constexpr ck_tile::index_t kBlockPerCu = 1;
+
+        ck_tile::index_t kGridSize =
+            (kept_dim_len_prod + TestReduce2dShape::Block_M - 1) / TestReduce2dShape::Block_M;
+
+        // Generic helper to create tuple from vector based on compile-time size
+        auto make_shape_tuple = []<std::size_t N>(const std::vector<ck_tile::index_t>& vec) {
+            return [&vec]<std::size_t... I>(std::index_sequence<I...>) {
+                return ck_tile::make_tuple(vec[I]...);
+            }(std::make_index_sequence<N>{});
+        };
+
+        auto input_shape_tuple   = make_shape_tuple.template operator()<InputDim>(input_shape);
+        auto input_strides_tuple = make_shape_tuple.template operator()<InputDim>(input_strides);
+
+        if(!Kernel::IsSupportedArgument(
+               output_shape[output_shape.size() - 1],
+               input_strides_tuple)) // output tensor's continuous dimension
+        {
+            throw std::runtime_error("Wrong! Arguments not supported!\n");
+        }
+
+        ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0},
+                               ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                   Kernel{},
+                                   kGridSize,
+                                   kBlockSize,
+                                   0,
+                                   static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
+                                   static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
+                                   input_shape_tuple,
+                                   input_strides_tuple,
+                                   kept_dims,
+                                   reduce_dims));
+
+        // Get results back
+        d_y_mem.FromDevice(h_y.data());
+
+        // Reference computation
+        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
+            h_x, h_y_ref, ReduceOpType{}, kept_dims, reduce_dims);
+
+        // Calculate proper error thresholds based on data types and number of accumulations
+        const auto rtol = ck_tile::get_relative_threshold<XDataType, YDataType, ComputeDataType>(
+            total_reduce_elements);
+        const auto atol = ck_tile::get_absolute_threshold<XDataType, YDataType, ComputeDataType>(
+            5.0f, total_reduce_elements);
+
+        bool result =
+            ck_tile::check_err(h_y, h_y_ref, "Error: Incorrect reduce results!", rtol, atol);
+        EXPECT_TRUE(result);
+    }
+
+    // Convenience functions for specific dimensional patterns
+    void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1)
+    {
+        constexpr auto kept_dims   = ck_tile::sequence<0>{};
+        constexpr auto reduce_dims = ck_tile::sequence<1>{};
+
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape   = {dim0, dim1};
+        std::vector<ck_tile::index_t> input_strides = {dim1, 1};
+
+        // Output shape and strides (keep dim0)
+        std::vector<ck_tile::index_t> output_shape   = {dim0};
+        std::vector<ck_tile::index_t> output_strides = {1};
+
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod     = dim0;
+        ck_tile::index_t total_reduce_elements = dim1;
+
+        RunGenericTest<2>(input_shape,
+                          input_strides,
+                          output_shape,
+                          output_strides,
+                          kept_dim_len_prod,
+                          total_reduce_elements,
+                          kept_dims,
+                          reduce_dims);
+    }
+
+    void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0,
+                                        ck_tile::index_t dim1,
+                                        ck_tile::index_t dim2)
+    {
+        constexpr auto kept_dims   = ck_tile::sequence<0>{};
+        constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
+
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape   = {dim0, dim1, dim2};
+        std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
+
+        // Output shape and strides (keep dim0)
+        std::vector<ck_tile::index_t> output_shape   = {dim0};
+        std::vector<ck_tile::index_t> output_strides = {1};
+
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod     = dim0;        // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = dim1 * dim2; // product of reduced dimensions
+
+        RunGenericTest<3>(input_shape,
+                          input_strides,
+                          output_shape,
+                          output_strides,
+                          kept_dim_len_prod,
+                          total_reduce_elements,
+                          kept_dims,
+                          reduce_dims);
+    }
+
+    void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0,
+                                        ck_tile::index_t dim1,
+                                        ck_tile::index_t dim2)
+    {
+        constexpr auto kept_dims   = ck_tile::sequence<0, 1>{};
+        constexpr auto reduce_dims = ck_tile::sequence<2>{};
+
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape   = {dim0, dim1, dim2};
+        std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
+
+        // Output shape and strides (keep dim0)
+        std::vector<ck_tile::index_t> output_shape   = {dim0, dim1};
+        std::vector<ck_tile::index_t> output_strides = {dim1, 1};
+
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod     = dim0 * dim1; // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = dim2;        // product of reduced dimensions
+
+        RunGenericTest<3>(input_shape,
+                          input_strides,
+                          output_shape,
+                          output_strides,
+                          kept_dim_len_prod,
+                          total_reduce_elements,
+                          kept_dims,
+                          reduce_dims);
+    }
+
+    void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N,
+                                         ck_tile::index_t C,
+                                         ck_tile::index_t H,
+                                         ck_tile::index_t W)
+    {
+        constexpr auto kept_dims   = ck_tile::sequence<0, 1>{};
+        constexpr auto reduce_dims = ck_tile::sequence<2, 3>{};
+
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape   = {N, C, H, W};
+        std::vector<ck_tile::index_t> input_strides = {C * H * W, H * W, W, 1};
+
+        // Output shape and strides (keep dim0, dim1)
+        std::vector<ck_tile::index_t> output_shape   = {N, C};
+        std::vector<ck_tile::index_t> output_strides = {C, 1};
+
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod     = N * C; // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions
+
+        RunGenericTest<4>(input_shape,
+                          input_strides,
+                          output_shape,
+                          output_strides,
+                          kept_dim_len_prod,
+                          total_reduce_elements,
+                          kept_dims,
+                          reduce_dims);
+    }
+
+    void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N,
+                                         ck_tile::index_t H,
+                                         ck_tile::index_t W,
+                                         ck_tile::index_t C)
+    {
+        constexpr auto kept_dims   = ck_tile::sequence<0, 3>{};
+        constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
+
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape   = {N, H, W, C};
+        std::vector<ck_tile::index_t> input_strides = {H * W * C, W * C, C, 1};
+
+        // Output shape and strides (keep dim0, dim1)
+        std::vector<ck_tile::index_t> output_shape   = {N, C};
+        std::vector<ck_tile::index_t> output_strides = {C, 1};
+
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod     = N * C; // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = H * W; // product of reduced dimensions
+
+        RunGenericTest<4>(input_shape,
+                          input_strides,
+                          output_shape,
+                          output_strides,
+                          kept_dim_len_prod,
+                          total_reduce_elements,
+                          kept_dims,
+                          reduce_dims);
+    }
+};
+
+// Shape parameters for different test configurations
+using Shape1_BlockWarps = ck_tile::sequence<4, 1>;
+using Shape1_BlockTile  = ck_tile::sequence<128, 128>;
+using Shape1_WarpTile   = ck_tile::sequence<32, 128>;
+using Shape1_ThreadTile = ck_tile::sequence<8, 8>;
+
+using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test
+using Shape2_BlockTile  = ck_tile::sequence<2, 1024>;
+using Shape2_WarpTile   = ck_tile::sequence<1, 512>;
+using Shape2_ThreadTile = ck_tile::sequence<1, 8>;
+
+// Test configurations for different data types and operations
+using TestConfig_F32_Add = std::tuple<float,
+                                      float,
+                                      float,
+                                      ck_tile::ReduceOp::Add,
+                                      Shape1_BlockWarps,
+                                      Shape1_BlockTile,
+                                      Shape1_WarpTile,
+                                      Shape1_ThreadTile>;
+
+using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
+                                      float,
+                                      ck_tile::half_t,
+                                      ck_tile::ReduceOp::Add,
+                                      Shape1_BlockWarps,
+                                      Shape1_BlockTile,
+                                      Shape1_WarpTile,
+                                      Shape1_ThreadTile>;
+
+using TestConfig_F32_CrossWarp = std::tuple<float,
+                                            float,
+                                            float,
+                                            ck_tile::ReduceOp::Add,
+                                            Shape2_BlockWarps,
+                                            Shape2_BlockTile,
+                                            Shape2_WarpTile,
+                                            Shape2_ThreadTile>;
+
+using TestConfig_F32_Max = std::tuple<float,
+                                      float,
+                                      float,
+                                      ck_tile::ReduceOp::Max,
+                                      Shape1_BlockWarps,
+                                      Shape1_BlockTile,
+                                      Shape1_WarpTile,
+                                      Shape1_ThreadTile>;
+
+using TestConfig_F32_SquareAdd = std::tuple<float,
+                                            float,
+                                            float,
+                                            ck_tile::ReduceOp::SquareAdd,
+                                            Shape1_BlockWarps,
+                                            Shape1_BlockTile,
+                                            Shape1_WarpTile,
+                                            Shape1_ThreadTile>;
+
+using TestTypes = ::testing::Types<TestConfig_F32_Add,
+                                   TestConfig_F16_Add,
+                                   TestConfig_F32_CrossWarp,
+                                   TestConfig_F32_Max,
+                                   TestConfig_F32_SquareAdd>;
+
+TYPED_TEST_SUITE(TestCkTileReduce, TestTypes);
+
+// 2D Tests - Keep dim0, reduce dim1
+TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_64x32)
+{
+    this->RunTest2D_KeepDim0_ReduceDim1(64, 32);
+}
+
+TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_1024x512)
+{
+    this->RunTest2D_KeepDim0_ReduceDim1(1024, 512);
+}
+
+// 3D Tests - Keep dim0, reduce dim1,2
+TYPED_TEST(TestCkTileReduce, Test3D_KeepDim0_ReduceDim12_128x128x1)
+{
+    this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8);
+}
+// 3D Tests - Keep dim0,1, reduce dim1
+TYPED_TEST(TestCkTileReduce, Test3D_KeepDim01_ReduceDim2_512x1024x16)
+{
+    this->RunTest3D_KeepDim01_ReduceDim2(512, 1024, 16);
+}
+
+// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC)
+TYPED_TEST(TestCkTileReduce, Test4D_KeepDim01_ReduceDim23_32x256x16x16)
+{
+    this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16);
+}
+// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC)
+TYPED_TEST(TestCkTileReduce, Test4D_KeepDim03_ReduceDim12_16x32x32x128)
+{
+    this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128);
+}
diff --git a/test/ck_tile/utility/CMakeLists.txt b/test/ck_tile/utility/CMakeLists.txt
new file mode 100644
index 0000000000..c57cafca5a
--- /dev/null
+++ b/test/ck_tile/utility/CMakeLists.txt
@@ -0,0 +1,4 @@
+message("-- Adding: test/ck_tile/utility/")
+
+# Add print tests
+add_subdirectory(print)
diff --git a/test/ck_tile/utility/print/CMakeLists.txt b/test/ck_tile/utility/print/CMakeLists.txt
new file mode 100644
index 0000000000..5300dd20ca
--- /dev/null
+++ b/test/ck_tile/utility/print/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Print utility tests
+add_gtest_executable(test_print_sequence test_print_sequence.cpp)
+add_gtest_executable(test_print_array test_print_array.cpp)
+add_gtest_executable(test_print_tuple test_print_tuple.cpp)
+add_gtest_executable(test_print_coordinate_transform test_print_coordinate_transform.cpp)
+add_gtest_executable(test_print_static_encoding_pattern test_print_static_encoding_pattern.cpp)
+add_gtest_executable(test_print_buffer_view test_print_buffer_view.cpp)
+add_gtest_executable(test_print_basic_types test_print_basic_types.cpp)
diff --git a/test/ck_tile/utility/print/README.md b/test/ck_tile/utility/print/README.md
new file mode 100644
index 0000000000..558c6faee4
--- /dev/null
+++ b/test/ck_tile/utility/print/README.md
@@ -0,0 +1,70 @@
+# Print Function Tests
+
+This directory contains unit tests for testing the print functionality of various data structures and coordinate transformations in the composable_kernel library.
+
+## Tests Included
+
+### test_print_sequence.cpp
+Tests the print functionality for `sequence<...>` containers:
+- Simple sequences with multiple elements
+- Single element sequences
+- Empty sequences
+- Longer sequences
+
+### test_print_array.cpp
+Tests the print functionality for `array<T, N>` containers:
+- Arrays with integer values
+- Single element arrays
+- Empty arrays (size 0)
+- Arrays with floating point values
+
+### test_print_tuple.cpp
+Tests the print functionality for `tuple<...>` containers:
+- Simple tuples with numbers
+- Single element tuples
+- Empty tuples
+- Mixed type tuples
+
+### test_print_coordinate_transform.cpp
+Tests the print functionality for coordinate transformation structures:
+- `pass_through` transform
+- `embed` transform
+- `merge` transform
+- `unmerge` transform
+- `freeze` transform
+
+## Testing Approach
+
+All tests use Google Test's `CaptureStdout()` functionality to capture the output from print functions and verify the formatting:
+
+```cpp
+testing::internal::CaptureStdout();
+print(object);
+std::string output = testing::internal::GetCapturedStdout();
+EXPECT_EQ(output, "expected_format");
+```
+
+This approach enables testing of print function output without affecting the console during test execution.
+
+## Building and Running
+
+The tests are integrated into the CMake build system. To build and run the print tests:
+
+```bash
+# Build the specific test
+make test_print_sequence
+
+# Run the test
+./test_print_sequence
+
+# Or run all print tests using CTest
+ctest -R "test_print"
+```
+
+## Adding New Tests
+
+To add tests for new data structures:
+
+1. Create a new test file: `test_print_<structure_name>.cpp`
+2. Follow the existing pattern using `CaptureStdout()`
+3. Add the test executable to `CMakeLists.txt`
diff --git a/test/ck_tile/utility/print/test_print_array.cpp b/test/ck_tile/utility/print/test_print_array.cpp
new file mode 100644
index 0000000000..2fe9bc2a0c
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_array.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_print_common.hpp"
+#include "ck_tile/core/container/array.hpp"
+#include "ck_tile/core/utility/print.hpp"
+
+namespace ck_tile {
+
+class PrintArrayTest : public PrintTest
+{
+};
+
+TEST_F(PrintArrayTest, PrintIntArray)
+{
+    // Test printing array<int, 3>
+    array<int, 3> arr{10, 20, 30};
+
+    std::string output = CapturePrintOutput(arr);
+
+    // The expected format should match the array print function implementation
+    EXPECT_EQ(output, "array{size: 3, data: [10, 20, 30]}");
+}
+
+TEST_F(PrintArrayTest, PrintSingleElementArray)
+{
+    // Test printing array<int, 1>
+    array<int, 1> arr{42};
+
+    std::string output = CapturePrintOutput(arr);
+
+    EXPECT_EQ(output, "array{size: 1, data: [42]}");
+}
+
+TEST_F(PrintArrayTest, PrintEmptyArray)
+{
+    // Test printing array<int, 0> (empty array)
+    array<int, 0> arr{};
+
+    std::string output = CapturePrintOutput(arr);
+
+    EXPECT_EQ(output, "array{size: 0, data: []}");
+}
+
+TEST_F(PrintArrayTest, PrintFloatArray)
+{
+    // Test printing array with float values
+    array<float, 2> arr{3.14f, 2.71f};
+
+    std::string output = CapturePrintOutput(arr);
+
+    // Note: float printing format may vary, so we'll test for basic structure
+    EXPECT_TRUE(output.find("array{size: 2, data: [") == 0);
+    EXPECT_TRUE(output.find("3.14") != std::string::npos);
+    EXPECT_TRUE(output.find("2.71") != std::string::npos);
+    EXPECT_TRUE(output.find("]}") == output.length() - 2);
+}
+
+} // namespace ck_tile
diff --git a/test/ck_tile/utility/print/test_print_basic_types.cpp b/test/ck_tile/utility/print/test_print_basic_types.cpp
new file mode 100644
index 0000000000..7a26b6371a
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_basic_types.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_print_common.hpp"
+#include "ck_tile/core/utility/print.hpp"
+
+namespace ck_tile {
+
+class PrintBasicTypesTest : public PrintTest
+{
+};
+
+TEST_F(PrintBasicTypesTest, PrintIntArray)
+{
+    int arr[4] = {1, 2, 3, 4};
+
+    std::string output = CapturePrintOutput(arr);
+
+    EXPECT_EQ(output, "[1, 2, 3, 4]");
+}
+
+TEST_F(PrintBasicTypesTest, PrintFloatArray)
+{
+    float arr[3] = {1.5f, 2.5f, 3.5f};
+
+    std::string output = CapturePrintOutput(arr);
+
+    // Note: floating point formatting may vary, so we check for key elements
+    EXPECT_TRUE(output.find("[") == 0);
+    EXPECT_TRUE(output.find("1.5") != std::string::npos);
+    EXPECT_TRUE(output.find("2.5") != std::string::npos);
+    EXPECT_TRUE(output.find("3.5") != std::string::npos);
+    EXPECT_TRUE(output.back() == ']');
+    EXPECT_TRUE(output.find(", ") != std::string::npos);
+}
+
+TEST_F(PrintBasicTypesTest, PrintDoubleArray)
+{
+    double arr[2] = {10.123, 20.456};
+
+    std::string output = CapturePrintOutput(arr);
+
+    EXPECT_TRUE(output.find("[") == 0);
+    EXPECT_TRUE(output.find("10.123") != std::string::npos);
+    EXPECT_TRUE(output.find("20.456") != std::string::npos);
+    EXPECT_TRUE(output.back() == ']');
+}
+
+TEST_F(PrintBasicTypesTest, PrintUnsignedIntArray)
+{
+    unsigned int arr[3] = {100u, 200u, 300u};
+
+    std::string output = CapturePrintOutput(arr);
+
+    EXPECT_EQ(output, "[100, 200, 300]");
+}
+
+TEST_F(PrintBasicTypesTest, PrintCharArray)
+{
+    char arr[5] = {'a', 'b', 'c', 'd', 'e'};
+
+    std::string output = CapturePrintOutput(arr);
+
+    EXPECT_EQ(output, "[a, b, c, d, e]");
+}
+
+TEST_F(PrintBasicTypesTest, PrintSingleElementArray)
+{
+    int arr[1] = {42};
+
+    std::string output = CapturePrintOutput(arr);
+
+    EXPECT_EQ(output, "[42]");
+}
+
+} // namespace ck_tile
diff --git a/test/ck_tile/utility/print/test_print_buffer_view.cpp b/test/ck_tile/utility/print/test_print_buffer_view.cpp
new file mode 100644
index 0000000000..66668a2103
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_buffer_view.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_print_common.hpp"
+#include "ck_tile/core/tensor/buffer_view.hpp"
+#include "ck_tile/core/utility/print.hpp"
+
+namespace ck_tile {
+
+class PrintBufferViewTest : public PrintTest
+{
+};
+
+TEST_F(PrintBufferViewTest, PrintGenericBufferView)
+{
+    // Test printing generic address space buffer_view
+    float data[4] = {100.f, 200.f, 300.f, 400.f};
+    auto bv       = make_buffer_view<address_space_enum::generic>(&data, 4);
+
+    std::string output = CapturePrintOutput(bv);
+
+    // Verify the output contains expected information
+    EXPECT_TRUE(output.find("buffer_view{AddressSpace: generic") != std::string::npos);
+    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
+    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
+    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
+    EXPECT_TRUE(output.find("}") != std::string::npos);
+}
+
+TEST_F(PrintBufferViewTest, PrintGlobalBufferView)
+{
+    // Test printing global address space buffer_view
+    float data[4] = {100.f, 200.f, 300.f, 400.f};
+    auto bv       = make_buffer_view<address_space_enum::global>(&data, 4);
+
+    std::string output = CapturePrintOutput(bv);
+
+    // Verify the output contains expected information
+    EXPECT_TRUE(output.find("buffer_view{AddressSpace: global") != std::string::npos);
+    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
+    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
+    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
+    EXPECT_TRUE(output.find("}") != std::string::npos);
+}
+
+TEST_F(PrintBufferViewTest, PrintLdsBufferView)
+{
+    // Test printing LDS address space buffer_view
+    float data[4] = {100.f, 200.f, 300.f, 400.f};
+    auto bv       = make_buffer_view<address_space_enum::lds>(data, 4);
+
+    std::string output = CapturePrintOutput(bv);
+
+    // Verify the output contains expected information
+    EXPECT_TRUE(output.find("buffer_view{AddressSpace: lds") != std::string::npos);
+    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
+    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
+    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
+    EXPECT_TRUE(output.find("}") != std::string::npos);
+}
+
+TEST_F(PrintBufferViewTest, PrintVgprBufferView)
+{
+    // Test printing VGPR address space buffer_view
+    float data[4] = {1.5f, 2.5f, 3.5f, 4.5f};
+    auto bv       = make_buffer_view<address_space_enum::vgpr>(data, 4);
+
+    std::string output = CapturePrintOutput(bv);
+
+    // Verify the output contains expected information
+    EXPECT_TRUE(output.find("buffer_view{AddressSpace: vgpr") != std::string::npos);
+    EXPECT_TRUE(output.find("p_data_:") != std::string::npos);
+    EXPECT_TRUE(output.find("buffer_size_:") != std::string::npos);
+    EXPECT_TRUE(output.find("invalid_element_value_:") != std::string::npos);
+    EXPECT_TRUE(output.find("}") != std::string::npos);
+}
+
+} // namespace ck_tile
diff --git a/test/ck_tile/utility/print/test_print_common.hpp b/test/ck_tile/utility/print/test_print_common.hpp
new file mode 100644
index 0000000000..3ba2270802
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_common.hpp
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <gtest/gtest.h>
+#include <gtest/gtest-spi.h>
+
+#include "ck_tile/core/utility/print.hpp"
+
+class PrintTest : public ::testing::Test
+{
+    protected:
+    void SetUp() override {}
+    void TearDown() override {}
+    // Helper function to capture and return the output of a print function
+    template <typename T>
+    std::string CapturePrintOutput(const T& type)
+    {
+        using namespace ck_tile;
+        testing::internal::CaptureStdout();
+        print(type);
+        return testing::internal::GetCapturedStdout();
+    }
+};
diff --git a/test/ck_tile/utility/print/test_print_coordinate_transform.cpp b/test/ck_tile/utility/print/test_print_coordinate_transform.cpp
new file mode 100644
index 0000000000..639b113eb7
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_coordinate_transform.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_print_common.hpp"
+#include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/utility/print.hpp"
+
+namespace ck_tile {
+
+class PrintCoordinateTransformTest : public PrintTest
+{
+};
+
+TEST_F(PrintCoordinateTransformTest, PrintPassThrough)
+{
+    // Test printing pass_through transform
+    auto pt = make_pass_through_transform(number<32>{});
+
+    std::string output = CapturePrintOutput(pt);
+
+    // Verify it contains the pass_through identifier and some structure
+    EXPECT_TRUE(output.find("pass_through{") == 0);
+    EXPECT_TRUE(output.find("up_lengths_") != std::string::npos);
+    EXPECT_TRUE(output.back() == '}');
+}
+
+TEST_F(PrintCoordinateTransformTest, PrintEmbed)
+{
+    // Test printing embed transform
+    auto embed_transform = make_embed_transform(make_tuple(number<4>{}, number<8>{}),
+                                                make_tuple(number<1>{}, number<4>{}));
+
+    std::string output = CapturePrintOutput(embed_transform);
+
+    // Verify it contains the embed identifier and key fields
+    EXPECT_TRUE(output.find("embed{") == 0);
+    EXPECT_TRUE(output.find("up_lengths_") != std::string::npos);
+    EXPECT_TRUE(output.find("coefficients_") != std::string::npos);
+    EXPECT_TRUE(output.back() == '}');
+}
+
+TEST_F(PrintCoordinateTransformTest, PrintMerge)
+{
+    // Test printing merge transform
+    auto merge_transform = make_merge_transform(make_tuple(number<4>{}, number<8>{}));
+
+    std::string output = CapturePrintOutput(merge_transform);
+
+    // Verify it contains merge identifier and key fields
+    EXPECT_TRUE(output.find("merge") ==
+                0); // Could be merge_v2_magic_division or merge_v3_division_mod
+    EXPECT_TRUE(output.find("low_lengths_") != std::string::npos ||
+                output.find("up_lengths_") != std::string::npos);
+    EXPECT_TRUE(output.back() == '}');
+}
+
+TEST_F(PrintCoordinateTransformTest, PrintUnmerge)
+{
+    // Test printing unmerge transform
+    auto unmerge_transform = make_unmerge_transform(make_tuple(number<4>{}, number<8>{}));
+
+    std::string output = CapturePrintOutput(unmerge_transform);
+
+    // Verify it contains the unmerge identifier and key fields
+    EXPECT_TRUE(output.find("unmerge{") == 0);
+    EXPECT_TRUE(output.find("up_lengths_") != std::string::npos);
+    EXPECT_TRUE(output.back() == '}');
+}
+
+TEST_F(PrintCoordinateTransformTest, PrintFreeze)
+{
+    // Test printing freeze transform
+    auto freeze_transform = make_freeze_transform(number<5>{});
+
+    std::string output = CapturePrintOutput(freeze_transform);
+
+    // Verify it contains the freeze identifier and key fields
+    EXPECT_TRUE(output.find("freeze{") == 0);
+    EXPECT_TRUE(output.find("low_idx_") != std::string::npos);
+    EXPECT_TRUE(output.back() == '}');
+}
+
+} // namespace ck_tile
diff --git a/test/ck_tile/utility/print/test_print_sequence.cpp b/test/ck_tile/utility/print/test_print_sequence.cpp
new file mode 100644
index 0000000000..e73a9f7e33
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_sequence.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_print_common.hpp"
+#include "ck_tile/core/utility/print.hpp"
+#include "ck_tile/core/container/sequence.hpp"
+
+namespace ck_tile {
+
+class PrintSequenceTest : public PrintTest
+{
+};
+
+TEST_F(PrintSequenceTest, PrintSimpleSequence)
+{
+    // Test printing sequence<1, 5, 8>
+    constexpr auto seq = sequence<1, 5, 8>{};
+
+    std::string output = CapturePrintOutput(seq);
+
+    // Verify the output format
+    EXPECT_EQ(output, "sequence<1, 5, 8>");
+}
+
+TEST_F(PrintSequenceTest, PrintSingleElementSequence)
+{
+    // Test printing sequence<42>
+    constexpr auto seq = sequence<42>{};
+
+    std::string output = CapturePrintOutput(seq);
+
+    EXPECT_EQ(output, "sequence<42>");
+}
+
+TEST_F(PrintSequenceTest, PrintEmptySequence)
+{
+    // Test printing sequence<> (empty sequence)
+    constexpr auto seq = sequence<>{};
+
+    std::string output = CapturePrintOutput(seq);
+
+    EXPECT_EQ(output, "sequence<>");
+}
+
+} // namespace ck_tile
diff --git a/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp
new file mode 100644
index 0000000000..d1cb408b5c
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_static_encoding_pattern.cpp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_print_common.hpp"
+#include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
+#include "ck_tile/core/utility/print.hpp"
+
+#include <sstream>
+
+namespace ck_tile {
+
+class PrintStaticEncodingPatternTest : public PrintTest
+{
+    protected:
+    void TestY0Y1Y2(const std::string& output, auto Y0, auto Y1, auto Y2)
+    {
+        std::stringstream expected;
+        expected << "<Y0, Y1, Y2>: <" << Y0 << ", " << Y1 << ", " << Y2 << ">";
+        EXPECT_TRUE(output.find(expected.str()) != std::string::npos);
+    }
+    void TestX0X1(const std::string& output, auto X0, auto X1)
+    {
+        std::stringstream expected;
+        expected << "<X0, X1>: <" << X0 << ", " << X1 << ">";
+        EXPECT_TRUE(output.find(expected.str()) != std::string::npos);
+    }
+};
+
+TEST_F(PrintStaticEncodingPatternTest, PrintThreadRakedPattern)
+{
+    // Test printing thread raked pattern
+    using PatternType =
+        TileDistributionEncodingPattern2D<64, 8, 16, 4, tile_distribution_pattern::thread_raked>;
+    PatternType pattern;
+
+    std::string output = CapturePrintOutput(pattern);
+
+    // Verify the output contains expected information
+    EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos);
+    EXPECT_TRUE(output.find("BlockSize:64") != std::string::npos);
+    EXPECT_TRUE(output.find("YPerTile:8") != std::string::npos);
+    EXPECT_TRUE(output.find("XPerTile:16") != std::string::npos);
+    EXPECT_TRUE(output.find("VecSize:4") != std::string::npos);
+    EXPECT_TRUE(output.find("thread_raked") != std::string::npos);
+    TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2);
+    TestX0X1(output, PatternType::X0, PatternType::X1);
+}
+
+TEST_F(PrintStaticEncodingPatternTest, PrintWarpRakedPattern)
+{
+    // Test printing warp raked pattern
+    using PatternType =
+        TileDistributionEncodingPattern2D<128, 16, 32, 8, tile_distribution_pattern::warp_raked>;
+    PatternType pattern;
+
+    std::string output = CapturePrintOutput(pattern);
+
+    // Verify the output contains expected information
+    EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos);
+    EXPECT_TRUE(output.find("BlockSize:128") != std::string::npos);
+    EXPECT_TRUE(output.find("YPerTile:16") != std::string::npos);
+    EXPECT_TRUE(output.find("XPerTile:32") != std::string::npos);
+    EXPECT_TRUE(output.find("VecSize:8") != std::string::npos);
+    EXPECT_TRUE(output.find("warp_raked") != std::string::npos);
+    TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2);
+    TestX0X1(output, PatternType::X0, PatternType::X1);
+}
+
+TEST_F(PrintStaticEncodingPatternTest, PrintBlockRakedPattern)
+{
+    // Test printing block raked pattern
+    using PatternType =
+        TileDistributionEncodingPattern2D<256, 32, 64, 16, tile_distribution_pattern::block_raked>;
+    PatternType pattern;
+
+    std::string output = CapturePrintOutput(pattern);
+
+    // Verify the output contains expected information
+    EXPECT_TRUE(output.find("TileDistributionEncodingPattern2D") != std::string::npos);
+    EXPECT_TRUE(output.find("BlockSize:256") != std::string::npos);
+    EXPECT_TRUE(output.find("YPerTile:32") != std::string::npos);
+    EXPECT_TRUE(output.find("XPerTile:64") != std::string::npos);
+    EXPECT_TRUE(output.find("VecSize:16") != std::string::npos);
+    EXPECT_TRUE(output.find("block_raked") != std::string::npos);
+    TestY0Y1Y2(output, PatternType::Y0, PatternType::Y1, PatternType::Y2);
+    TestX0X1(output, PatternType::X0, PatternType::X1);
+}
+
+} // namespace ck_tile
diff --git a/test/ck_tile/utility/print/test_print_tuple.cpp b/test/ck_tile/utility/print/test_print_tuple.cpp
new file mode 100644
index 0000000000..79aaf1b3af
--- /dev/null
+++ b/test/ck_tile/utility/print/test_print_tuple.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "test_print_common.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/utility/print.hpp"
+
+namespace ck_tile {
+
+class PrintTupleTest : public PrintTest
+{
+};
+
+TEST_F(PrintTupleTest, PrintSimpleTuple)
+{
+    // Test printing tuple with numbers
+    auto tup = make_tuple(number<1>{}, number<5>{}, number<8>{});
+
+    std::string output = CapturePrintOutput(tup);
+
+    // Verify the output format matches tuple print implementation
+    EXPECT_TRUE(output.find("tuple<") == 0);
+    EXPECT_TRUE(output.find("1") != std::string::npos);
+    EXPECT_TRUE(output.find("5") != std::string::npos);
+    EXPECT_TRUE(output.find("8") != std::string::npos);
+    EXPECT_TRUE(output.back() == '>');
+}
+
+TEST_F(PrintTupleTest, PrintSingleElementTuple)
+{
+    // Test printing tuple with single element
+    auto tup = make_tuple(number<42>{});
+
+    std::string output = CapturePrintOutput(tup);
+
+    EXPECT_TRUE(output.find("tuple<") == 0);
+    EXPECT_TRUE(output.find("42") != std::string::npos);
+    EXPECT_TRUE(output.back() == '>');
+}
+
+TEST_F(PrintTupleTest, PrintEmptyTuple)
+{
+    // Test printing empty tuple
+    auto tup = make_tuple();
+
+    std::string output = CapturePrintOutput(tup);
+
+    EXPECT_EQ(output, "tuple<>");
+}
+
+TEST_F(PrintTupleTest, PrintMixedTypeTuple)
+{
+    // Test printing tuple with mixed types (numbers and constants)
+    auto tup = make_tuple(number<10>{}, constant<20>{}, number<30>{});
+
+    std::string output = CapturePrintOutput(tup);
+
+    EXPECT_TRUE(output.find("tuple<") == 0);
+    EXPECT_TRUE(output.find("10") != std::string::npos);
+    EXPECT_TRUE(output.find("20") != std::string::npos);
+    EXPECT_TRUE(output.find("30") != std::string::npos);
+    EXPECT_TRUE(output.back() == '>');
+}
+
+} // namespace ck_tile
diff --git a/test/grouped_convnd_fwd_activation/CMakeLists.txt b/test/grouped_convnd_fwd_activation/CMakeLists.txt
index f964325c06..4d5196505c 100644
--- a/test/grouped_convnd_fwd_activation/CMakeLists.txt
+++ b/test/grouped_convnd_fwd_activation/CMakeLists.txt
@@ -1,4 +1,10 @@
 if(GPU_TARGETS MATCHES "gfx9")
+    add_gtest_executable(test_grouped_convnd_fwd_bias_bnorm_clamp test_grouped_convnd_fwd_bias_bnorm_clamp.cpp)
+    target_link_libraries(test_grouped_convnd_fwd_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
+
+    add_gtest_executable(test_grouped_convnd_fwd_gk_bias_bnorm_clamp test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp)
+    target_link_libraries(test_grouped_convnd_fwd_gk_bias_bnorm_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_bnorm_clamp_instance device_grouped_conv3d_fwd_bias_bnorm_clamp_instance)
+
     add_gtest_executable(test_grouped_convnd_fwd_bias_clamp test_grouped_convnd_fwd_bias_clamp.cpp)
     target_link_libraries(test_grouped_convnd_fwd_bias_clamp PRIVATE utility device_grouped_conv2d_fwd_bias_clamp_instance device_grouped_conv3d_fwd_bias_clamp_instance)
 
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp
new file mode 100644
index 0000000000..bf96d11d53
--- /dev/null
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_bias_bnorm_clamp.cpp
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+
+template <typename Tuple>
+class TestGroupedConvndFwd : public ::testing::Test
+{
+    protected:
+    using DataType  = std::tuple_element_t<0, Tuple>;
+    using InLayout  = std::tuple_element_t<1, Tuple>;
+    using WeiLayout = std::tuple_element_t<2, Tuple>;
+    using OutLayout = std::tuple_element_t<3, Tuple>;
+    using IndexType = ck::index_t;
+
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty());
+        bool pass = true;
+        for(auto& param : conv_params)
+        {
+            pass = pass && ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
+                                                                                  InLayout,
+                                                                                  WeiLayout,
+                                                                                  OutLayout,
+                                                                                  DataType,
+                                                                                  DataType,
+                                                                                  DataType,
+                                                                                  DataType,
+                                                                                  DataType,
+                                                                                  IndexType,
+                                                                                  false /*BiasGK*/>(
+                               true,  // do_verification
+                               1,     // init_method: integer value
+                               false, // do_log
+                               false, // time_kernel
+                               param);
+        }
+        EXPECT_TRUE(pass);
+    }
+};
+
+using namespace ck::tensor_layout::convolution;
+
+using KernelTypes2d = ::testing::Types<std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<float, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>>;
+
+using KernelTypes3d = ::testing::Types<std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
+                                       std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
+                                       std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>>;
+
+template <typename Tuple>
+class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
+{
+};
+
+TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
+
+TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
+{
+    this->conv_params.clear();
+    this->conv_params.push_back(
+        {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back(
+        {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->template Run<2>();
+}
+
+TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
+{
+    this->conv_params.clear();
+    this->conv_params.push_back(
+        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->template Run<3>();
+}
diff --git a/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp
new file mode 100644
index 0000000000..2400008ffa
--- /dev/null
+++ b/test/grouped_convnd_fwd_activation/test_grouped_convnd_fwd_gk_bias_bnorm_clamp.cpp
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "profiler/profile_grouped_conv_fwd_bias_bnorm_clamp_impl.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using BiasNormalizeInInferClamp = ck::tensor_operation::element_wise::BiasNormalizeInInferClamp;
+
+template <typename Tuple>
+class TestGroupedConvndFwd : public ::testing::Test
+{
+    protected:
+    using DataType  = std::tuple_element_t<0, Tuple>;
+    using InLayout  = std::tuple_element_t<1, Tuple>;
+    using WeiLayout = std::tuple_element_t<2, Tuple>;
+    using OutLayout = std::tuple_element_t<3, Tuple>;
+    using IndexType = ck::index_t;
+
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty());
+        bool pass = true;
+        for(auto& param : conv_params)
+        {
+            pass = pass &&
+                   ck::profiler::profile_grouped_conv_fwd_bias_clamp_impl<NDimSpatial,
+                                                                          InLayout,
+                                                                          WeiLayout,
+                                                                          OutLayout,
+                                                                          DataType,
+                                                                          DataType,
+                                                                          DataType,
+                                                                          DataType,
+                                                                          DataType,
+                                                                          IndexType,
+                                                                          true /*ElementwiseGK*/>(
+                       true,  // do_verification
+                       1,     // init_method: integer value
+                       false, // do_log
+                       false, // time_kernel
+                       param);
+        }
+        EXPECT_TRUE(pass);
+    }
+};
+
+using namespace ck::tensor_layout::convolution;
+
+using KernelTypes2d = ::testing::Types<std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<float, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>>;
+
+using KernelTypes3d = ::testing::Types<std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
+                                       std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
+                                       std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>>;
+
+template <typename Tuple>
+class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
+{
+};
+
+TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
+
+TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
+{
+    this->conv_params.clear();
+    this->conv_params.push_back(
+        {2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back(
+        {2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->template Run<2>();
+}
+
+TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
+{
+    this->conv_params.clear();
+    this->conv_params.push_back(
+        {3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->template Run<3>();
+}
diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp
index 4bb38a0c16..b2e615b9d8 100644
--- a/test/mx_mfma_op/mx_mfma_op.hpp
+++ b/test/mx_mfma_op/mx_mfma_op.hpp
@@ -187,11 +187,11 @@ __device__ AFragT load_A_col_major(AType const* input_ptr)
     auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M);
     auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M);
 
-    using ARawT = typename scalar_type<AFragT>::type;
-    using AScalarFragT =
-        vector_type<ARawT,
-                    BLOCK_M * BLOCK_K / WAVE_SIZE /
-                        (ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
+    using ARawT        = typename scalar_type<AFragT>::type;
+    using AScalarFragT = typename vector_type<
+        ARawT,
+        BLOCK_M * BLOCK_K / WAVE_SIZE /
+            (ck::is_same_v<ck::remove_cvref_t<AType>, ck::f4x2_pk_t> ? 2 : 1)>::type;
 
     AScalarFragT fragA{};
 
@@ -319,8 +319,9 @@ __device__ AFragT load_A_row_major(AType const* input_ptr)
     // Flatten to 1D row_major offsets.
     auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
 
-    using ARawT         = typename scalar_type<AFragT>::type;
-    using AScalarChunkT = vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
+    using ARawT = typename scalar_type<AFragT>::type;
+    using AScalarChunkT =
+        typename vector_type<ARawT, scalar_type<AFragT>::vector_size / num_chunks>::type;
 
     union
     {
@@ -544,8 +545,9 @@ __device__ BFragT load_B_col_major(BType const* input_ptr)
 
     auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col
 
-    using BRawT         = typename scalar_type<BFragT>::type;
-    using BScalarChunkT = vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
+    using BRawT = typename scalar_type<BFragT>::type;
+    using BScalarChunkT =
+        typename vector_type<BRawT, scalar_type<BFragT>::vector_size / num_chunks>::type;
 
     union
     {
@@ -780,7 +782,7 @@ struct store_C_col_major<CType, CFragT, 32, 32>
 
         // we can vector store 4 contiguous elements at a time.
         using CRawT        = typename scalar_type<CFragT>::type;
-        using CScalarFragT = vector_type<CRawT, VW>::type;
+        using CScalarFragT = typename vector_type<CRawT, VW>::type;
         union
         {
             CFragT frag;
@@ -940,12 +942,14 @@ __global__ void matmul(const packed_type_t<AType>* a, const packed_type_t<BType>
     assert(threadIdx.x < WAVE_SIZE);
     assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
 
-    using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
-    using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
+    using AFragT =
+        typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
+    using BFragT =
+        typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
 
-    using CFragT        = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
+    using CFragT        = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
     using AccumFragT    = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
-    using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
+    using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
 
     // Create frags
     auto fragA   = AFragT{};
@@ -1019,14 +1023,16 @@ __global__ void matmul(const packed_type_t<AType>* a,
     assert(threadIdx.x < WAVE_SIZE);
     assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
 
-    using AFragT = vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
-    using BFragT = vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
+    using AFragT =
+        typename vector_type<PackedAType, BLOCK_M * BLOCK_K / WAVE_SIZE / packed_size_a>::type;
+    using BFragT =
+        typename vector_type<PackedBType, BLOCK_K * BLOCK_N / WAVE_SIZE / packed_size_b>::type;
 
-    using CFragT        = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
+    using CFragT        = typename vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
     using AccumFragT    = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
-    using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
-    using AScaleFragT   = vector_type<ScaleType, 1>::type;
-    using BScaleFragT   = vector_type<ScaleType, 1>::type;
+    using RawAccumFragT = typename vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
+    using AScaleFragT   = typename vector_type<ScaleType, 1>::type;
+    using BScaleFragT   = typename vector_type<ScaleType, 1>::type;
 
     // Create frags
     auto fragA   = AFragT{};
diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt
index d8200ed947..fe9b7802a7 100644
--- a/tile_engine/ops/gemm/CMakeLists.txt
+++ b/tile_engine/ops/gemm/CMakeLists.txt
@@ -1,215 +1,148 @@
+
 set(GEMM_DATATYPE "fp8;fp16" CACHE STRING "List of datatypes for GEMM (semicolon-separated)")
 set(GEMM_LAYOUT "rcr" CACHE STRING "List of layout for GEMM (semicolon-separated)")
 
-# Pre-generate all kernel lists to avoid blocking during parallel builds
-foreach(dt IN LISTS GEMM_DATATYPE)
-    foreach(l IN LISTS GEMM_LAYOUT)
-        set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${dt}/${l}")
-        file(MAKE_DIRECTORY "${working_path}")
-
-        if (l STREQUAL "rcr")
-            set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
-        else()
-            set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json")
-        endif()
-
-        # Only run if files don't exist
-        if (NOT EXISTS "${working_path}/gemm_instance_blobs.txt")
-            execute_process(
-                COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py"
-                        --working_path "${working_path}"
-                        --datatype "${dt}"
-                        --layout "${l}"
-                        --config_json "${json_blob}"
-                        --list_blobs
-                RESULT_VARIABLE ret
-            )
-            if (NOT ret EQUAL 0)
-                message(FATAL_ERROR "Failed to pre-generate kernel list for ${dt} ${l}")
-            endif()
-        endif()
-    endforeach()
-endforeach()
-
 function(build_gemm_for_datatype datatype layout)
     set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}")
 
-    if (layout STREQUAL "rcr")
+    # Comment this if-else block when using user_provided_config
+    if(layout STREQUAL "rcr")
         set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
     else()
         set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json")
     endif()
-    # Uncomment to override:
-    # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json")
 
-    # Read pre-generated kernel lists
+    # uncomment this if you want to use user_provided_config.json
+    # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json")
+    
+    # Generate kernel list
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
+                --working_path ${working_path}
+                --datatype ${datatype}
+                --layout ${layout}
+                --config_json ${json_blob}
+                --list_blobs
+        RESULT_VARIABLE ret
+    )
+    if(NOT ret EQUAL 0)
+        message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${ret}")
+    endif()
+
     file(STRINGS "${working_path}/gemm_instance_blobs.txt" codegen_blobs)
     file(STRINGS "${working_path}/gemm_instance_blobs_range.txt" codegen_blobs_range)
-
+    
     # Generate the blobs
     add_custom_command(
         OUTPUT ${codegen_blobs}
-        COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py"
+        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
                 --working_path "${working_path}"
-                --datatype "${datatype}"
-                --layout "${layout}"
+                --datatype ${datatype}
+                --layout ${layout}
                 --config_json "${json_blob}"
                 --gen_blobs
         COMMENT "Generating GEMM instance sources for ${datatype} ${layout}"
     )
     add_custom_target(gemm_gen_${datatype}_${layout} DEPENDS ${codegen_blobs})
 
-    # Parse ranges to identify unique trait names
-    set(unique_traits)
-    foreach(range_line IN LISTS codegen_blobs_range)
-        string(STRIP "${range_line}" stripped_line)
-        separate_arguments(split_line UNIX_COMMAND "${stripped_line}")
-        list(GET split_line 0 trait_name)
-        list(APPEND unique_traits "${trait_name}")
-    endforeach()
-    list(REMOVE_DUPLICATES unique_traits)
+    set(intermediate_libs)
+    list(LENGTH codegen_blobs codegen_blobs_len)
 
-    # Build each trait separately
-    foreach(trait IN LISTS unique_traits)
-        set(trait_files)
-        foreach(range_line IN LISTS codegen_blobs_range)
-            string(STRIP "${range_line}" stripped_line)
-            separate_arguments(split_line UNIX_COMMAND "${stripped_line}")
-            list(GET split_line 0 name)
-            if (name STREQUAL trait)
-                list(GET split_line 1 first)
-                list(GET split_line 2 last)
-                math(EXPR total_files "${last} - ${first}")
-                if (total_files GREATER 0)
-                    foreach(j RANGE ${first} ${last}-1)
-                        list(LENGTH codegen_blobs blobs_len)
-                        if (j LESS blobs_len)
-                            list(GET codegen_blobs ${j} f)
-                            list(APPEND trait_files "${f}")
-                        endif()
-                    endforeach()
-                endif()
-            endif()
-        endforeach()
+    foreach(blob IN LISTS codegen_blobs_range)
+        string(STRIP "${blob}" stripped_blob)
+        separate_arguments(spilit_blob UNIX_COMMAND "${stripped_blob}")
+        # Each line is: <trait_name> <first_index_inclusive> <last_index_exclusive>   
+        list(GET spilit_blob 0 name)
+        list(GET spilit_blob 1 first)
+        list(GET spilit_blob 2 last)
+        math(EXPR total_files "${last} - ${first}")
+        if(total_files EQUAL 0)
+            continue()        # nothing for this trait
+        endif()
 
-        if (trait_files)
-            # Create object libraries with chunking
-            set(chunk_size 3)  # adjust as needed for memory vs parallelism
-            list(LENGTH trait_files num_files)
-            math(EXPR num_chunks "( ${num_files} + ${chunk_size} - 1 ) / ${chunk_size}")
+        # Object libraries (chunked) per trait
+        set(sub_intermediate_libs)
+        set(chunk_size 3)
+        math(EXPR num_chunks "( ${total_files} + ${chunk_size} - 1 ) / ${chunk_size}")
+        math(EXPR num_chunks_minus_1 "${num_chunks} - 1")
+        
+        foreach(i RANGE 0 ${num_chunks_minus_1})
+            math(EXPR start "${first} + ${i} * ${chunk_size} ")
+            math(EXPR end "${start} + ${chunk_size} - 1")
 
-            set(trait_obj_libs)
-            foreach(i RANGE 0 ${num_chunks}-1)
-                math(EXPR start "${i} * ${chunk_size}")
-                math(EXPR end "${start} + ${chunk_size} - 1")
-
-                set(chunk_files)
-                foreach(j RANGE ${start} ${end})
-                    if (j LESS ${num_files})
-                        list(GET trait_files ${j} f)
-                        list(APPEND chunk_files "${f}")
-                    endif()
-                endforeach()
-
-                if (chunk_files)
-                    set(obj_lib_name "gemm_obj_${trait}_${i}_${datatype}_${layout}")
-                    add_library(${obj_lib_name} OBJECT ${chunk_files})
-                    add_dependencies(${obj_lib_name} gemm_gen_${datatype}_${layout})
-
-                    target_compile_options(${obj_lib_name} PRIVATE
-                        -Wno-undefined-func-template
-                        -Wno-float-equal
-                        --offload-compress
-                        -O3
-                        -fno-exceptions
-                    )
-
-                    set_target_properties(${obj_lib_name} PROPERTIES
-                        UNITY_BUILD ON
-                        UNITY_BUILD_BATCH_SIZE 2
-                    )
-
-                    list(APPEND trait_obj_libs "${obj_lib_name}")
+            set(chunk_files)
+            foreach(j RANGE ${start} ${end})
+                if(j LESS ${last} AND j LESS ${codegen_blobs_len})
+                    list(GET codegen_blobs ${j} f)
+                    list(APPEND chunk_files "${f}")
                 endif()
             endforeach()
 
-            # Static library for this trait
-            if (trait_obj_libs)
-                set(trait_lib_name "gemm_lib_${trait}_${datatype}_${layout}")
-                set(obj_exprs)
-                foreach(objlib IN LISTS trait_obj_libs)
-                    list(APPEND obj_exprs "$<TARGET_OBJECTS:${objlib}>")
-                endforeach()
-
-                add_library(${trait_lib_name} STATIC ${obj_exprs})
-                add_dependencies(${trait_lib_name} gemm_gen_${datatype}_${layout})
-
-                # Trait-specific executable
-                set(exec_name "benchmark_gemm_${datatype}_${layout}_${trait}")
-                add_executable(${exec_name} benchmark_gemm.cpp)
-                target_link_libraries(${exec_name} PRIVATE ${trait_lib_name})
-                target_include_directories(${exec_name} PRIVATE
-                    "${CMAKE_CURRENT_LIST_DIR}"
-                    "${working_path}"
-                )
-                target_compile_definitions(${exec_name} PRIVATE
-                    GEMM_TRAIT_FILTER="${trait}"
-                )
-                target_compile_options(${exec_name} PRIVATE
-                    -Wno-undefined-func-template
-                    -Wno-float-equal
-                    --offload-compress
-                )
+            #list(LENGTH chunk_files chunk_files_len)
+            #if(chunk_files_len AND chunk_files_len GREATER 1)
+            if(chunk_files)
+                set(sub_intermediate_lib_name "gemm_objlib_${name}_${i}_${datatype}_${layout}")
+                add_library(${sub_intermediate_lib_name} OBJECT ${chunk_files})
+                list(APPEND sub_intermediate_libs ${sub_intermediate_lib_name})
             endif()
-        endif()
-    endforeach()
 
-    # Master executable including all traits
-    set(all_trait_libs)
-    foreach(trait IN LISTS unique_traits)
-        if (TARGET gemm_lib_${trait}_${datatype}_${layout})
-            list(APPEND all_trait_libs "gemm_lib_${trait}_${datatype}_${layout}")
-        endif()
-    endforeach()
+        endforeach()
 
-    if (all_trait_libs)
-        add_executable(benchmark_gemm_${datatype}_${layout} benchmark_gemm.cpp)
-        target_link_libraries(benchmark_gemm_${datatype}_${layout} PRIVATE ${all_trait_libs})
-        target_include_directories(benchmark_gemm_${datatype}_${layout} PRIVATE
-            "${CMAKE_CURRENT_LIST_DIR}"
-            "${working_path}"
-        )
-        target_compile_options(benchmark_gemm_${datatype}_${layout} PRIVATE
-            -Wno-undefined-func-template
-            -Wno-float-equal
-            --offload-compress
-        )
-    endif()
+        # ------------------ Bundle the object libs into one static lib ---------
+        #list(LENGTH sub_intermediate_libs sub_intermediate_libs_len)
+        #if(sub_intermediate_libs AND sub_intermediate_libs_len GREATER 1)
+        if(sub_intermediate_libs)
+            set(intermediate_lib_name "gemm_staticlib_${name}_${datatype}_${layout}")
+            # Collect the $<TARGET_OBJECTS:...> expressions
+            
+            set(obj_exprs)
+            foreach(objlib IN LISTS sub_intermediate_libs)
+                list(APPEND obj_exprs $<TARGET_OBJECTS:${objlib}>)
+            endforeach()
+            
+            add_library(${intermediate_lib_name} STATIC ${obj_exprs})
+            add_dependencies(${intermediate_lib_name} gemm_gen_${datatype}_${layout})
+            #foreach(objlib IN LISTS sub_intermediate_libs)
+            #    target_sources(${intermediate_lib_name} PRIVATE $<TARGET_OBJECTS:${objlib}>)
+            #endforeach()
+            list(APPEND intermediate_libs ${intermediate_lib_name})
+        endif()
+
+    endforeach()
+    
+    # Interface library for instances
+    add_library(gemm_template_instances_${datatype}_${layout} INTERFACE)
+    add_dependencies(gemm_template_instances_${datatype}_${layout} gemm_gen_${datatype}_${layout})
+    target_link_libraries(gemm_template_instances_${datatype}_${layout} INTERFACE ${intermediate_libs})
+    target_include_directories(gemm_template_instances_${datatype}_${layout} INTERFACE
+        ${CMAKE_CURRENT_LIST_DIR}
+        "${working_path}"
+    )
+    set_target_properties(gemm_template_instances_${datatype}_${layout} PROPERTIES LINKER_LANGUAGE CXX)
+    
+    # Host API interface library
+    add_library(gemm_host_api_${datatype}_${layout} INTERFACE)
+    target_link_libraries(gemm_host_api_${datatype}_${layout} INTERFACE gemm_template_instances_${datatype}_${layout})
+    target_include_directories(gemm_host_api_${datatype}_${layout} INTERFACE
+        ${CMAKE_CURRENT_LIST_DIR}
+        "${working_path}"
+    )
+    
+
+    # Executable per datatype
+    set(exec_name "benchmark_gemm_${datatype}_${layout}")
+    add_executable(${exec_name} benchmark_gemm.cpp)
+    target_link_libraries(${exec_name} PRIVATE gemm_host_api_${datatype}_${layout})
+    target_compile_options(${exec_name} PRIVATE
+        -Wno-undefined-func-template
+        -Wno-float-equal
+        --offload-compress
+    )
 endfunction()
 
-# Process each datatype/layout
+# Process each datatype in isolation
 foreach(dt IN LISTS GEMM_DATATYPE)
     foreach(l IN LISTS GEMM_LAYOUT)
-        build_gemm_for_datatype("${dt}" "${l}")
+        build_gemm_for_datatype(${dt} ${l})
     endforeach()
 endforeach()
-
-# Master target for parallel builds
-set(ALL_GEMM_TARGETS)
-foreach(dt IN LISTS GEMM_DATATYPE)
-    foreach(l IN LISTS GEMM_LAYOUT)
-        list(APPEND ALL_GEMM_TARGETS "benchmark_gemm_${dt}_${l}")
-    endforeach()
-endforeach()
-add_custom_target(benchmark_gemm_all DEPENDS ${ALL_GEMM_TARGETS})
-
-# Use faster linker if available
-find_program(LLD_LINKER "ld.lld")
-find_program(MOLD_LINKER "mold")
-if (MOLD_LINKER)
-    message(STATUS "Using mold linker for faster linking")
-    add_link_options(-fuse-ld=mold)
-elseif (LLD_LINKER)
-    message(STATUS "Using lld linker for faster linking")
-    add_link_options(-fuse-ld=lld)
-endif()
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py
index 9ff76724cc..4a990f3309 100644
--- a/tile_engine/ops/gemm/codegen_utils.py
+++ b/tile_engine/ops/gemm/codegen_utils.py
@@ -65,93 +65,6 @@ CSHUFFLE_EPILOGUE = """
                                                              UniversalGemmProblem::TransposeC,
                                                              memory_operation>>;
 """
-HOT_LOOP_FALSE = """
-            if(tail_num == ck_tile::TailNumber::Full)
-            {
-                RunSplitk(ck_tile::bool_constant<false>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-            }
-            else if(tail_num == ck_tile::TailNumber::Odd)
-            {
-                RunSplitk(ck_tile::bool_constant<false>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-            }
-            else if(tail_num == ck_tile::TailNumber::Even)
-            {
-                RunSplitk(ck_tile::bool_constant<false>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
-            }
-            else
-            {
-                throw std::runtime_error("Num K loop must be larger than number of prefetech stages.");
-            }
-"""
-RUN_MEM = """
-            // Handle One and Full cases directly
-            if (tail_num == ck_tile::TailNumber::One) {
-                RunSplitk(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
-            } else if (tail_num == ck_tile::TailNumber::Full) {
-                RunSplitk(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-            }
-            
-            auto check_tail = [&](auto... TNs) {
-                ([&]{
-                    if constexpr(BaseGemmPipeline::PrefetchStages > static_cast<int>(decltype(TNs)::value)) {
-                        if(tail_num == decltype(TNs)::value) {
-                            RunSplitk(ck_tile::bool_constant<true>{},
-                                    ck_tile::integral_constant<ck_tile::TailNumber, decltype(TNs)::value>{});
-                        }
-                    }
-                }(), ...);
-            };
-
-            check_tail(
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{}
-            );
-"""
-
-RUN_COMPV3 = """
-            if(tail_num == ck_tile::TailNumber::Full)
-            {
-                RunSplitk(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-            }
-            else if(tail_num == ck_tile::TailNumber::Odd)
-            {
-                RunSplitk(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-            }
-            else if(tail_num == ck_tile::TailNumber::Even)
-            {
-                RunSplitk(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
-            }
-            else
-            {
-                throw std::runtime_error("The tail number is wrong. It should be Full, Odd, or Even.");
-            }
-"""
-
-RUN_COMPV4 = """
-            if(tail_num == ck_tile::TailNumber::Three)
-            {
-                RunSplitk(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-            }
-            else
-            {
-                RunSplitk(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-            }
-"""
-
 
 PIPELINE_MAP = {
     "mem": ["ck_tile::BaseGemmPipelineAgBgCrMem", "ck_tile::GemmPipelineAgBgCrMem"],
@@ -172,8 +85,6 @@ SCHEDULER_MAP = {
 
 EPILOGUE_MAP = {"default": DEFAULT_EPILOGUE, "cshuffle": CSHUFFLE_EPILOGUE}
 
-HOT_LOOP_TRUE = {"mem": RUN_MEM, "compv3": RUN_COMPV3, "compv4": RUN_COMPV4}
-
 
 def BOOL_MAP(b_):
     return {True: "true", False: "false"}[bool(b_)]
diff --git a/tile_engine/ops/gemm/configs/benchmark.json b/tile_engine/ops/gemm/configs/benchmark.json
index 1560698b77..def3ca4453 100644
--- a/tile_engine/ops/gemm/configs/benchmark.json
+++ b/tile_engine/ops/gemm/configs/benchmark.json
@@ -96,6 +96,12 @@
             "values": [
                 false
             ]
+        },
+        "persistent": {
+            "values": [
+                false,
+                true
+            ]
         }
     }
 }
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/configs/custom_ci_config.json b/tile_engine/ops/gemm/configs/custom_ci_config.json
index 9187fb01eb..ca6c7230fd 100644
--- a/tile_engine/ops/gemm/configs/custom_ci_config.json
+++ b/tile_engine/ops/gemm/configs/custom_ci_config.json
@@ -77,6 +77,12 @@
       "values": [
         false
       ]
+    },
+    "persistent": {
+      "values": [
+        false,
+        true
+      ]
     }
   }
 }
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/configs/default_config.json b/tile_engine/ops/gemm/configs/default_config.json
index 12a8ddd4b7..5bd51b809a 100644
--- a/tile_engine/ops/gemm/configs/default_config.json
+++ b/tile_engine/ops/gemm/configs/default_config.json
@@ -95,6 +95,11 @@
       "values": [
         false
       ]
+    },
+    "persistent": {
+      "values": [
+        false
+      ]
     }
   }
-}
\ No newline at end of file
+}
diff --git a/tile_engine/ops/gemm/configs/user_provided_config.json b/tile_engine/ops/gemm/configs/user_provided_config.json
index 5761b39ada..76e194f6b9 100644
--- a/tile_engine/ops/gemm/configs/user_provided_config.json
+++ b/tile_engine/ops/gemm/configs/user_provided_config.json
@@ -82,6 +82,12 @@
       "values": [
         false
       ]
+    },
+    "persistent": {
+        "values": [
+            false,
+            true
+        ]
     }
   }
 }
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp
index 2c4af8955f..f28f5dd29c 100644
--- a/tile_engine/ops/gemm/gemm_host_api.hpp
+++ b/tile_engine/ops/gemm/gemm_host_api.hpp
@@ -144,7 +144,8 @@ inline auto create_args(int argc, char* argv[])
         .insert("pad_k",
                 "false",
                 "Whether pad or not in k direction. Possible values are true or false. Default is "
-                "false.");
+                "false.")
+        .insert("persistent", "false", "Whether to use persistent kernel. Default is false.");
 
     bool result = arg_parser.parse(argc, argv);
     return std::make_tuple(result, arg_parser);
@@ -208,12 +209,13 @@ void permute_vectors_i4x4_b(Tensor& tensor)
 auto get_kernel_func_by_trait(const ck_tile::ArgParser& arg_parser)
 {
     KernelTraits trait;
-    trait.pipeline  = arg_parser.get_str("pipeline");
-    trait.scheduler = arg_parser.get_str("scheduler");
-    trait.epilogue  = arg_parser.get_str("epilogue");
-    trait.pad_m     = arg_parser.get_bool("pad_m");
-    trait.pad_n     = arg_parser.get_bool("pad_n");
-    trait.pad_k     = arg_parser.get_bool("pad_k");
+    trait.pipeline   = arg_parser.get_str("pipeline");
+    trait.scheduler  = arg_parser.get_str("scheduler");
+    trait.epilogue   = arg_parser.get_str("epilogue");
+    trait.pad_m      = arg_parser.get_bool("pad_m");
+    trait.pad_n      = arg_parser.get_bool("pad_n");
+    trait.pad_k      = arg_parser.get_bool("pad_k");
+    trait.persistent = arg_parser.get_bool("persistent");
 
     bool structured_sparsity = arg_parser.get_bool("structured_sparsity");
 
diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py
index 4a35a2bcd3..6d713bdcb8 100755
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -15,16 +15,9 @@ from json_config import GemmConfig, RangeConfigParam
 from codegen_utils import (
     DATA_TYPE_MAP,
     LAYOUT_MAP,
-    DEFAULT_EPILOGUE,
-    CSHUFFLE_EPILOGUE,
-    HOT_LOOP_FALSE,
-    RUN_MEM,
-    RUN_COMPV3,
-    RUN_COMPV4,
     PIPELINE_MAP,
     SCHEDULER_MAP,
     EPILOGUE_MAP,
-    HOT_LOOP_TRUE,
     BOOL_MAP,
     warp_tile_supported_combinations,
     trait_unsupported_combinations,
@@ -114,7 +107,7 @@ class GemmCodeGenerator:
 
     def _generate_all_traits(self):
         """Generate all possible kernel traits names."""
-        params = ["pipeline", "epilogue", "scheduler", "pad_m", "pad_n", "pad_k"]
+        params = ["pipeline", "epilogue", "scheduler", "pad_m", "pad_n", "pad_k", "persistent"]
 
         # Generate all unique_combinations
         _unique = set(
@@ -124,13 +117,14 @@ class GemmCodeGenerator:
         )
 
         for combo in _unique:
-            pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = combo
+            pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent = combo
             current_combination = (pipeline, epilogue, scheduler)
 
             if current_combination not in trait_unsupported_combinations:
                 trait_name = (
                     f"{pipeline}_{epilogue}_{scheduler}_"
-                    f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}"
+                    f"{BOOL_MAP(pad_m)}_{BOOL_MAP(pad_n)}_{BOOL_MAP(pad_k)}_"
+                    f"{BOOL_MAP(persistent)}"
                 )
                 self.valid_trait_names.append(trait_name)
             else:
@@ -189,7 +183,7 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map["matrix_c"]]};
 
     def _generate_trait_file(self, trait: str):
         """Generate a trait with all tile/warp combinations."""
-        pipeline, epilogue, scheduler, pad_m, pad_n, pad_k = trait.split("_")
+        pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent = trait.split("_")
         filename = f"gemm_{trait}.hpp"
 
         content = f"""// SPDX-License-Identifier: MIT
@@ -206,8 +200,7 @@ namespace {trait} {{
 """
         # Add template struct with configuration
         content += self._generate_kernel_struct(
-            pipeline, epilogue, scheduler, pad_m, pad_n, pad_k
-        )
+            pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent)
 
         content += f"\n}} // namespace {trait}\n"
         (self.output_dir / filename).write_text(content)
@@ -220,6 +213,7 @@ namespace {trait} {{
         pad_m: str,
         pad_n: str,
         pad_k: str,
+        persistent: str,
     ) -> str:
         """Generate the code block of kernel struct"""
         return f"""
@@ -229,9 +223,10 @@ template <int TileM, int TileN, int TileK,
           int WarpTileM, int WarpTileN, int WarpTileK,
           bool structured_sparsity>
 struct GemmKernel {{
-    static constexpr bool kPadM = {pad_m};
-    static constexpr bool kPadN = {pad_n};
-    static constexpr bool kPadK = {pad_k};
+    static constexpr bool kPadM       = {pad_m};
+    static constexpr bool kPadN       = {pad_n};
+    static constexpr bool kPadK       = {pad_k};
+    static constexpr bool kPersistent = {persistent};
 
     static float launch(ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) {{
         static constexpr bool permuteA = false;
@@ -250,7 +245,6 @@ struct GemmKernel {{
                                    permuteA,
                                    permuteB>;
 
-
         using TilePartitioner =
             ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
                                                       TileParitionerGroupNum,
@@ -261,7 +255,8 @@ struct GemmKernel {{
 
         using GemmUniversalTraits =
             ck_tile::TileGemmUniversalTraits<kPadM, kPadN, kPadK, DoubleSmemBuffer,
-                                             ALayout, BLayout, CLayout, TransposeC, structured_sparsity>;
+                                             ALayout, BLayout, CLayout, TransposeC, 
+                                             structured_sparsity, kPersistent>;
 
         using GemmPipelineProblem =
             ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
@@ -297,14 +292,14 @@ struct GemmKernel {{
             using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
             auto kargs   = Kernel::MakeKernelArgs(args);
 
-            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-            constexpr dim3 blocks = Kernel::BlockSize();
-
             if(!Kernel::IsSupportedArgument(kargs))
             {{
                 throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!");
             }}
 
+            constexpr dim3 blocks = Kernel::BlockSize();
+            const dim3 grids = {'Kernel::MaxOccupancyGridSize(stream)' if persistent == 'true' else 'Kernel::GridSize(args.M, args.N, args.k_batch)'};
+
             if(stream.log_level_ > 0)
             {{
                 std::cout << "Launching kernel with args:"
@@ -377,11 +372,7 @@ struct GemmKernel {{
             }}
         }};
 
-        if(has_hot_loop) {{
-            {HOT_LOOP_TRUE[pipeline]}
-        }} else {{
-            {HOT_LOOP_FALSE}
-        }}
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
 
         return ave_time;
     }}
@@ -395,7 +386,8 @@ struct GemmKernel {{
                 "{pad_k}" + "_" +
                 "{pipeline}" + "_" +
                 "{epilogue}" + "_" +
-                "{scheduler}";
+                "{scheduler}" + "_" +
+                "{persistent}";
     }}
 }};
 """
@@ -673,6 +665,8 @@ struct KernelTraits
     bool pad_n;
     /// @brief Indicates whether padding is applied to the K dimension.
     bool pad_k;
+    /// @brief Indicates whether the kernel is persistent.
+    bool persistent;
 };
 
 struct GemmDispatcher {
@@ -773,7 +767,8 @@ private:
                trait.scheduler + "_" +
                (trait.pad_m ? "true" : "false") + "_" +
                (trait.pad_n ? "true" : "false") + "_" +
-               (trait.pad_k ? "true" : "false");
+               (trait.pad_k ? "true" : "false") + "_" +
+               (trait.persistent ? "true" : "false");
     }
 };
 
diff --git a/tile_engine/ops/gemm/json_config.py b/tile_engine/ops/gemm/json_config.py
index 675a2052ef..04f2dd4890 100644
--- a/tile_engine/ops/gemm/json_config.py
+++ b/tile_engine/ops/gemm/json_config.py
@@ -107,6 +107,7 @@ class TraitConfig:
     pad_m: EnumConfigParam
     pad_n: EnumConfigParam
     pad_k: EnumConfigParam
+    persistent: EnumConfigParam
 
 
 @dataclass
@@ -215,6 +216,9 @@ class GemmConfig:
                 pad_k=EnumConfigParam(
                     values=config_dict["trait_config"]["pad_k"]["values"]
                 ),
+                persistent=EnumConfigParam(
+                    values=config_dict["trait_config"]["persistent"]["values"]
+                ),
             )
 
             return cls(