Revert "Shard several of the most costly targets. (#2266)" (#2361)

This reverts commit 33c4b3be9d77ee5932c88a27d4364c4aab774de0. [ROCm/composable_kernel commit: cdfd7722bf]
2026-07-14 11:07:44 +00:00 · 2025-06-17 13:56:30 -07:00
parent f0d44c77d7
commit fcb07b7311
41 changed files with 820 additions and 1318 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,3 @@ build*/

 # Python cache
 __pycache__/
-
-.cache/
-
--- a/cmake/ShardInstantiation.cmake
+++ b/cmake/ShardInstantiation.cmake
@@ -1,116 +0,0 @@
-# Function to generate templated instantiation functions and caller function.
-
-# In order to reduce build times, we split the instantiation of template functions into multiple files.
-# Developers can use ck::util::generate_sharded_instantiations to generate the instantiation functions,
-# which can be placed the TEMPLATE_FILE (typically a .in file).
-
-# This CMake function generates the instantiation functions and a caller function that calls all the instantiation 
-# functions. The ck::util::generate_sharded_instantiations function allows us to generate an arbitrary number of
-# shards (NUM_SHARDS). This function loops over the shards, generates an instantiation function for each shard,
-# and generates a caller function that calls all the instantiation functions.
-
-# The explicit instatiation pattern requires the use of `extern template` to avoid implicit instantiation
-# of the template functions in the caller function, and that code is automatically generated by this function.
-
-# In addition to the user-supplied template, this CMake function uses two generic templates:
-#
-# 1. `instantiate_shard.in`: This is the template for the instantiation functions.
-# 2. `call_shard.in`: This is the template for the caller function that calls all the instantiation functions.
-
-# This function takes the following arguments:
-#
-# - INSTANCES_NAME: The name of the instances (the calling function will be named `add_${INSTANCE_NAMES}`).
-# - TEMPLATE_FILE: The path to the template file that contains the templated instantiation function definitions.
-# - NUM_SHARDS: The number of shards to generate.
-# - OUTPUT_DIR: The build directory where the generated source files will be placed.
-# - SRC_LIST: The list of source files to which the generated source files will be added.
-
-
-function(generate_sharded_instantiations)
-    cmake_parse_arguments(
-        GEN_SHARDED
-        # No boolean arguments
-        ""
-        # Single-value arguments
-        "INSTANCES_NAME;TEMPLATE_FILE;NUM_SHARDS;OUTPUT_DIR;SRC_LIST"
-        # No multi-value arguments.
-        ""
-        ${ARGN}
-    )
-    if (NOT GEN_SHARDED_INSTANCES_NAME)
-        message(FATAL_ERROR "INSTANCES_NAME is required for generate_sharded_instantiations")
-    endif()
-    if (NOT GEN_SHARDED_TEMPLATE_FILE)
-        message(FATAL_ERROR "TEMPLATE_FILE is required for generate_sharded_instantiations")
-    endif()
-    if (NOT GEN_SHARDED_NUM_SHARDS)
-        message(FATAL_ERROR "NUM_SHARDS is required for generate_sharded_instantiations")
-    endif()
-    if(NOT GEN_SHARDED_OUTPUT_DIR)
-        message(FATAL_ERROR "OUTPUT_DIR is required for generate_sharded_instantiations")
-    endif()
-    if (NOT GEN_SHARDED_SRC_LIST)
-        message(FATAL_ERROR "SRC_LIST is required for generate_sharded_instantiations")
-    endif()
-
-    file(MAKE_DIRECTORY ${GEN_SHARDED_OUTPUT_DIR})
-
-
-    set(GENERATED_SOURCE_FILES "")
-    set(EXTERN_TEMPLATE_STATEMENTS "")
-    set(CALL_STATEMENTS "")
-    message(STATUS "Generating sharded instantiations for target: ${GEN_SHARDED_INSTANCES_NAME}")
-
-    set(INSTANCES "${GEN_SHARDED_INSTANCES_NAME}")
-    
-    # Generate the inc file with the template function defintions.
-    # This include file will hold the template function definitions and a using alias for all the shard
-    # instantiation functions.
-    configure_file(
-        "${GEN_SHARDED_TEMPLATE_FILE}"
-        "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}.inc"
-        @ONLY
-    )
-
-    # Generate the sharded instantiation functions.
-    # This is where the build parallelization happens.
-    # Each of these source files will contain a single instantiation function for a shard,
-    # which will be called sequentially by the caller function.
-    set(INC_DIR "${GEN_SHARDED_INC_DIR}")
-    math(EXPR LAST_SHARD_ID "${GEN_SHARDED_NUM_SHARDS} - 1")
-    foreach(SHARD_ID RANGE 0 ${LAST_SHARD_ID})
-        set(NUM_SHARDS "${GEN_SHARDED_NUM_SHARDS}")
-        set(SHARD_FUNCTION_PATH "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}_shard_${SHARD_ID}.cpp")
-        set(SHARD_FUNCTION_TEMPLATE "${PROJECT_SOURCE_DIR}/cmake/instantiate_shard.in")
-        configure_file(
-            "${SHARD_FUNCTION_TEMPLATE}"
-            "${SHARD_FUNCTION_PATH}"
-            @ONLY
-        )
-        list(APPEND GENERATED_SOURCE_FILES "${SHARD_FUNCTION_PATH}")
-        set(SHARDED_FUNCTION_NAME "add_${INSTANCES}_shard<${NUM_SHARDS}, ${SHARD_ID}>")
-        list(APPEND EXTERN_TEMPLATE_STATEMENTS "extern template void\n${SHARDED_FUNCTION_NAME}(\n  ${INSTANCES}& instances)")
-        list(APPEND CALL_STATEMENTS "  ${SHARDED_FUNCTION_NAME}(instances)")
-    endforeach()
-
-    # Join the include statements, the extern template declarations, and the call statements each
-    # into a single string for variable substitution in the caller function.
-    string(REPLACE ";" ";\n" INCLUDE_STATEMENTS "${INCLUDE_STATEMENTS}")
-    string(REPLACE ";" ";\n" CALL_STATEMENTS "${CALL_STATEMENTS}")
-    string(REPLACE ";" ";\n" EXTERN_TEMPLATE_STATEMENTS "${EXTERN_TEMPLATE_STATEMENTS}")
-
-    # Generate the caller function.
-    set(CALLER_FUNCTION_PATH "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}.cpp")
-    set(FUNCTION_TEMPLATE "${PROJECT_SOURCE_DIR}/cmake/call_shard.in")
-    configure_file(
-        "${FUNCTION_TEMPLATE}"
-        "${CALLER_FUNCTION_PATH}"
-        @ONLY
-    )
-    list(APPEND GENERATED_SOURCE_FILES "${CALLER_FUNCTION_PATH}")
-
-    # Add the generated source files to the list of source files.
-    # This allows the generated source files to be included in the build.
-    list(APPEND ${GEN_SHARDED_SRC_LIST} ${GENERATED_SOURCE_FILES})
-    set(${GEN_SHARDED_SRC_LIST} "${${GEN_SHARDED_SRC_LIST}}" PARENT_SCOPE)
-endfunction()
--- a/cmake/call_shard.in
+++ b/cmake/call_shard.in
@@ -1,15 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "@INSTANCES@.inc"
-
-namespace ck::tensor_operation::device::instance {
-
-@EXTERN_TEMPLATE_STATEMENTS@;
-
-void add_@INSTANCES@(
-    @INSTANCES@& instances) {
-@CALL_STATEMENTS@; 
-}
-
-} // namespace ck::tensor_operation::device::instance
--- a/cmake/instantiate_shard.in
+++ b/cmake/instantiate_shard.in
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "@INSTANCES@.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_@INSTANCES@_shard<@NUM_SHARDS@, @SHARD_ID@>(
-    @INSTANCES@& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/include/ck/utility/filter_tuple.hpp
+++ b/include/ck/utility/filter_tuple.hpp
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <tuple>
-#include <type_traits>
-#include <utility>
-
-#include "ck/utility/functional.hpp"
-#include "ck/utility/sequence.hpp"
-
-namespace ck::util {
-
-template <typename Tuple, std::size_t Stride, std::size_t Offset>
-struct filter_tuple_by_modulo
-{
-    // Validate Stride and Offset.
-    static_assert(Stride > 0, "Offset must be positive.");
-    static_assert(Offset >= 0 && Offset < Stride,
-                  "Offset must be positive and less than the stride.");
-
-    // Generate filtered indices for this stride and offset.
-    static constexpr int new_size = (std::tuple_size_v<Tuple> + Stride - Offset - 1) / Stride;
-
-    template <std::size_t... Is>
-    static constexpr auto to_index(std::index_sequence<Is...>)
-    {
-        return std::index_sequence<(Offset + Is * Stride)...>{};
-    }
-
-    using filtered_indices = decltype(to_index(std::make_index_sequence<new_size>{}));
-
-    // Helper struct to construct the new tuple type from the filtered indices.
-    template <typename T, typename Indices>
-    struct make_filtered_tuple_type_impl;
-
-    template <typename T, std::size_t... Is>
-    struct make_filtered_tuple_type_impl<T, std::index_sequence<Is...>>
-    {
-        using type = std::tuple<std::tuple_element_t<Is, T>...>;
-    };
-
-    using type = typename make_filtered_tuple_type_impl<Tuple, filtered_indices>::type;
-};
-
-// Filter a tuple with a stride and offset.
-//
-// Tuple is a std::tuple or equivalent
-// Stride is a positive integer
-// Offset is a positive integer smaller than ofset
-//
-// Evaluates to a smaller tuple type from elements of T with stride M and offset I.
-//
-// Can be used to filter a tuple of types for sharded instantiations.
-template <typename Tuple, std::size_t Stride, std::size_t Offset>
-using filter_tuple_by_modulo_t = typename filter_tuple_by_modulo<Tuple, Stride, Offset>::type;
-
-// Example compile-time test:
-// using OriginalTuple =
-//    std::tuple<int, double, char, float, long, short, bool, char, long long, unsigned int>;
-// using NewTuple_Every3rdFrom2nd = filter_tuple_by_modulo_t<OriginalTuple, 3, 1>;
-// static_assert(std::is_same_v<NewTuple_Every3rdFrom2nd, std::tuple<double, long, char>>,
-//               "Test Case 1 Failed: Every 3rd from 2nd");
-
-} // namespace ck::util
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -688,6 +688,7 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -1,5 +1,5 @@
 # XDL_DL_WMMA_KERNELS
-set(GROUPED_CONV2D_FWD
+add_instance_library(device_grouped_conv2d_fwd_instance
   #xdl
   # GNHWC, GKYXC, GNHWK
   xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -19,6 +19,8 @@ set(GROUPED_CONV2D_FWD
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
   # NGCHW, GKCYX, NGKHW
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp
@@ -44,10 +46,12 @@ set(GROUPED_CONV2D_FWD
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
   # NHWGC, GKYXC, NHWGK
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
   # NGCHW, GKCYX, NGKHW
   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instance.cpp
@@ -67,6 +71,7 @@ set(GROUPED_CONV2D_FWD
   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp
   # NGCHW, GKCYX, NGKHW
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp
@@ -100,47 +105,3 @@ set(GROUPED_CONV2D_FWD
  wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp
  wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp
 )
-# Add generated files for sharded instantiations.
-include(ShardInstantiation)
-
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances
-  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.in
-  NUM_SHARDS 16
-  SRC_LIST GROUPED_CONV2D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl
-)
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances
-  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.in
-  NUM_SHARDS 16
-  SRC_LIST GROUPED_CONV2D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl
-)
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances
-  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.in
-  NUM_SHARDS 16
-  SRC_LIST GROUPED_CONV2D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
-)
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.in
-  NUM_SHARDS 21
-  SRC_LIST GROUPED_CONV2D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.in
-  NUM_SHARDS 21
-  SRC_LIST GROUPED_CONV2D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-add_instance_library(device_grouped_conv2d_fwd_instance ${GROUPED_CONV2D_FWD})
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
@@ -1,14 +1,16 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
 #include "ck/host_utility/device_prop.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances =
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NGCHW,
                                                                GKCYX,
@@ -20,23 +22,19 @@ using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances =
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances_shard([[maybe_unused]]
-    device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(
        instances,
-        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
-                                                                                           NGCHW,
-                                                                                           GKCYX,
-                                                                                           Empty_Tuple,
-                                                                                           NGKHW,
-                                                                                           ConvFwdDefault>,
-                                           Shards,
-                                           ShardIndex>{});
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
+                                                        NGCHW,
+                                                        GKCYX,
+                                                        Empty_Tuple,
+                                                        NGKHW,
+                                                        ConvFwdDefault>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
@@ -3,11 +3,13 @@

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances =
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NGCHW,
                                                                GKCYX,
@@ -19,40 +21,32 @@ using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances =
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances_shard(
-    device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
                                                                              NGCHW,
                                                                              GKCYX,
                                                                              Empty_Tuple,
                                                                              NGKHW,
-                                                                              ConvFwdDefault>,
-                                   Shards,
-                                   ShardIndex>{});
+                                                                              ConvFwdDefault>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
                                                                              NGCHW,
                                                                              GKCYX,
                                                                              Empty_Tuple,
                                                                              NGKHW,
-                                                                              ConvFwd1x1P0>,
-                                   Shards,
-                                   ShardIndex>{});
+                                                                              ConvFwd1x1P0>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
                                                                              NGCHW,
                                                                              GKCYX,
                                                                              Empty_Tuple,
                                                                              NGKHW,
-                                                                              ConvFwd1x1S1P0>,
-                                   Shards,
-                                   ShardIndex>{});
+                                                                              ConvFwd1x1S1P0>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
@@ -3,11 +3,13 @@

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances =
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NGCHW,
                                                                GKCYX,
@@ -19,40 +21,32 @@ using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances =
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances_shard(
-    device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                              NGCHW,
-                                                                              GKCYX,
-                                                                              Empty_Tuple,
-                                                                              NGKHW,
-                                                                              ConvFwdDefault>,
-                                   Shards,
-                                   ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                             NGCHW,
+                                                                             GKCYX,
+                                                                             Empty_Tuple,
+                                                                             NGKHW,
+                                                                             ConvFwdDefault>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                              NGCHW,
-                                                                              GKCYX,
-                                                                              Empty_Tuple,
-                                                                              NGKHW,
-                                                                              ConvFwd1x1P0>,
-                                   Shards,
-                                   ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                             NGCHW,
+                                                                             GKCYX,
+                                                                             Empty_Tuple,
+                                                                             NGKHW,
+                                                                             ConvFwd1x1P0>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                              NGCHW,
-                                                                              GKCYX,
-                                                                              Empty_Tuple,
-                                                                              NGKHW,
-                                                                              ConvFwd1x1S1P0>,
-                                   Shards,
-                                   ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                             NGCHW,
+                                                                             GKCYX,
+                                                                             Empty_Tuple,
+                                                                             NGKHW,
+                                                                             ConvFwd1x1S1P0>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.in
@@ -1,62 +1,66 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"

-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                F32,
-                                                                F32,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
                                                                Empty_Tuple,
-                                                                F32,
+                                                                int8_t,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwdDefault,
-                                                                                     Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave>{});
+
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1P0,
-                                                                                     Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Interwave>{});
+
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1S1P0,
-                                                                                     Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Interwave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdOddC,
+                                                                                  Interwave>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.in
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances_shard(
-    device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                           NHWGC,
-                                                           GKYXC,
-                                                           Empty_Tuple,
-                                                           NHWGK,
-                                                           ConvFwdDefault,
-                                                           Interwave>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                      NHWGC,
-                                                                                      GKYXC,
-                                                                                      Empty_Tuple,
-                                                                                      NHWGK,
-                                                                                      ConvFwd1x1P0,
-                                                                                      Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                           NHWGC,
-                                                           GKYXC,
-                                                           Empty_Tuple,
-                                                           NHWGK,
-                                                           ConvFwd1x1S1P0,
-                                                           Interwave>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                      NHWGC,
-                                                                                      GKYXC,
-                                                                                      Empty_Tuple,
-                                                                                      NHWGK,
-                                                                                      ConvFwdOddC,
-                                                                                      Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.in
@@ -1,62 +1,66 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"

-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                F16,
-                                                                F16,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
                                                                Empty_Tuple,
-                                                                F16,
+                                                                int8_t,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f16_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwdDefault,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave>{});
+
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f16_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1P0,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Intrawave>{});
+
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f16_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1S1P0,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Intrawave>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                  NHWGC,
+                                                                                  GKYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NHWGK,
+                                                                                  ConvFwdOddC,
+                                                                                  Intrawave>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.in
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances_shard(
-    device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                           NHWGC,
-                                                           GKYXC,
-                                                           Empty_Tuple,
-                                                           NHWGK,
-                                                           ConvFwdDefault,
-                                                           Intrawave>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                      NHWGC,
-                                                                                      GKYXC,
-                                                                                      Empty_Tuple,
-                                                                                      NHWGK,
-                                                                                      ConvFwd1x1P0,
-                                                                                      Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                           NHWGC,
-                                                           GKYXC,
-                                                           Empty_Tuple,
-                                                           NHWGK,
-                                                           ConvFwd1x1S1P0,
-                                                           Intrawave>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                      NHWGC,
-                                                                                      GKYXC,
-                                                                                      Empty_Tuple,
-                                                                                      NHWGK,
-                                                                                      ConvFwdOddC,
-                                                                                      Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -11,6 +11,8 @@ set(GROUPED_CONV3D_FWD
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
+   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp
@@ -30,13 +32,23 @@ set(GROUPED_CONV3D_FWD
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp

   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp
+   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp
   
-      xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp
-xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
+   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp
   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp
   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp
@@ -59,99 +71,6 @@ xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
 )
-# Add generated files for sharded instantiations.
-include(ShardInstantiation)
-
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances
-  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.in
-  NUM_SHARDS 8
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl
-)
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances
-  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.in
-  NUM_SHARDS 8
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl
-)
-
-set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.in
-  NUM_SHARDS 10
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.in
-  NUM_SHARDS 10
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.in
-  NUM_SHARDS 10
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.in
-  NUM_SHARDS 10
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.in
-  NUM_SHARDS 10
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances
-  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.in
-  NUM_SHARDS 10
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
-)
-
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances
-  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
-  NUM_SHARDS 12
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
-)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances
-  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
-  NUM_SHARDS 12
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
-)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances
-  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.in
-  NUM_SHARDS 12
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
-)
-generate_sharded_instantiations(
-  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances
-  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.in
-  NUM_SHARDS 12
-  SRC_LIST GROUPED_CONV3D_FWD
-  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
-)

 if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
    list(APPEND GROUPED_CONV3D_FWD
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC,
+                                                        Empty_Tuple,
+                                                        NDHWGK,
+                                                        ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                                                   NDHWGC,
+                                                                                   GKZYXC,
+                                                                                   Empty_Tuple,
+                                                                                   NDHWGK,
+                                                                                   ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NDHWGC,
+                                                        GKZYXC,
+                                                        Empty_Tuple,
+                                                        NDHWGK,
+                                                        ConvFwd1x1S1P0>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGK,
+                                                                  ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGK,
+                                                                  ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                                  NDHWGC,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGK,
+                                                                  ConvFwd1x1S1P0>{});
+    }
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                               NDHWGC,
+                                                               GKZYXC,
+                                                               Empty_Tuple,
+                                                               NDHWGK,
+                                                               ConvFwd1x1S1P0>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                            NDHWGC,
-                                                            GKZYXC,
-                                                            Empty_Tuple,
-                                                            NDHWGK,
-                                                            ConvFwdDefault>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                            NDHWGC,
-                                                            GKZYXC,
-                                                            Empty_Tuple,
-                                                            NDHWGK,
-                                                            ConvFwd1x1P0>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                            NDHWGC,
-                                                            GKZYXC,
-                                                            Empty_Tuple,
-                                                            NDHWGK,
-                                                            ConvFwd1x1S1P0>,
-            Shards,
-            ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
-
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGK,
+                                                       ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                                                  NDHWGC,
+                                                                                  GKZYXC,
+                                                                                  Empty_Tuple,
+                                                                                  NDHWGK,
+                                                                                  ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NDHWGC,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGK,
+                                                       ConvFwd1x1S1P0>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC,
+                                                                 Empty_Tuple,
+                                                                 NDHWGK,
+                                                                 ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC,
+                                                                 Empty_Tuple,
+                                                                 NDHWGK,
+                                                                 ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                 NDHWGC,
+                                                                 GKZYXC,
+                                                                 Empty_Tuple,
+                                                                 NDHWGK,
+                                                                 ConvFwd1x1S1P0>{});
+    }
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              ConvFwdDefault>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              ConvFwd1x1P0>{});
+        add_device_operation_instances(
+            instances,
+            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                              NDHWGC,
+                                                              GKZYXC,
+                                                              Empty_Tuple,
+                                                              NDHWGK,
+                                                              ConvFwd1x1S1P0>{});
+    }
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                           NDHWGC,
-                                                           GKZYXC,
-                                                           Empty_Tuple,
-                                                           NDHWGK,
-                                                           ConvFwdDefault>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                                                      NDHWGC,
-                                                                                      GKZYXC,
-                                                                                      Empty_Tuple,
-                                                                                      NDHWGK,
-                                                                                      ConvFwd1x1P0>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                           NDHWGC,
-                                                           GKZYXC,
-                                                           Empty_Tuple,
-                                                           NDHWGK,
-                                                           ConvFwd1x1S1P0>,
-            Shards,
-            ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
-
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NGCDHW,
+                                                        GKCZYX,
+                                                        Empty_Tuple,
+                                                        NGKDHW,
+                                                        ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                                                   NGCDHW,
+                                                                                   GKCZYX,
+                                                                                   Empty_Tuple,
+                                                                                   NGKDHW,
+                                                                                   ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                        NGCDHW,
+                                                        GKCZYX,
+                                                        Empty_Tuple,
+                                                        NGKDHW,
+                                                        ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.in
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                            NGCDHW,
-                                                            GKCZYX,
-                                                            Empty_Tuple,
-                                                            NGKDHW,
-                                                            ConvFwdDefault>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                            NGCDHW,
-                                                            GKCZYX,
-                                                            Empty_Tuple,
-                                                            NGKDHW,
-                                                            ConvFwd1x1P0>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                            NGCDHW,
-                                                            GKCZYX,
-                                                            Empty_Tuple,
-                                                            NGKDHW,
-                                                            ConvFwd1x1S1P0>,
-            Shards,
-            ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
-
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NGCDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGKDHW,
+                                                       ConvFwdDefault>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1P0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                       NGCDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGKDHW,
+                                                       ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.in
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                           NGCDHW,
-                                                           GKCZYX,
-                                                           Empty_Tuple,
-                                                           NGKDHW,
-                                                           ConvFwdDefault>,
-            Shards,
-            ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                                                      NGCDHW,
-                                                                                      GKCZYX,
-                                                                                      Empty_Tuple,
-                                                                                      NGKDHW,
-                                                                                      ConvFwd1x1P0>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                           NGCDHW,
-                                                           GKCZYX,
-                                                           Empty_Tuple,
-                                                           NGKDHW,
-                                                           ConvFwd1x1S1P0>,
-            Shards,
-            ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NGCDHW,
+                                                                              GKCZYX,
+                                                                              Empty_Tuple,
+                                                                              NGKDHW,
+                                                                              ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NGCDHW,
+                                                                              GKCZYX,
+                                                                              Empty_Tuple,
+                                                                              NGKDHW,
+                                                                              ConvFwd1x1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                              NGCDHW,
+                                                                              GKCZYX,
+                                                                              Empty_Tuple,
+                                                                              NGKDHW,
+                                                                              ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwdDefault>{});
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwd1x1P0>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                             NGCDHW,
+                                                                             GKCZYX,
+                                                                             Empty_Tuple,
+                                                                             NGKDHW,
+                                                                             ConvFwd1x1S1P0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_1of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_1of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 0>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_2of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_2of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 1>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_3of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_3of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 2>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_4of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_4of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 3>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_5of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_5of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 4>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_6of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_6of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 5>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_7of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_7of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 6>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_8of8.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance_8of8.cpp
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.inc"
-
-namespace ck::tensor_operation::device::instance {
-template void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_sharded<8, 7>(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances);
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp
@@ -1,14 +1,15 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {

-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances =
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
                                                                GKCZYX,
@@ -20,43 +21,35 @@ using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances =
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances& instances)
+                                                                PassThrough>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<3,
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
                                                                                  NGCDHW,
                                                                                  GKCZYX,
                                                                                  Empty_Tuple,
                                                                                  NGKDHW,
-                                                                                  ConvFwdDefault>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                                  ConvFwdDefault,
+                                                                                  Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
                                                                                  NGCDHW,
                                                                                  GKCZYX,
                                                                                  Empty_Tuple,
                                                                                  NGKDHW,
-                                                                                  ConvFwd1x1P0>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<3,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
                                                                                  NGCDHW,
                                                                                  GKCZYX,
                                                                                  Empty_Tuple,
                                                                                  NGKDHW,
-                                                                                  ConvFwd1x1S1P0>,
-                                       Shards,
-                                       ShardIndex>{});
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Interwave>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.in
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
-                                                           NGCDHW,
-                                                           GKCZYX,
-                                                           Empty_Tuple,
-                                                           NGKDHW,
-                                                           ConvFwdDefault,
-                                                           Interwave>,
-            Shards,
-            ShardIndex>{});
-    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
-                                                                                      NGCDHW,
-                                                                                      GKCZYX,
-                                                                                      Empty_Tuple,
-                                                                                      NGKDHW,
-                                                                                      ConvFwd1x1P0,
-                                                                                      Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
-                                                           NGCDHW,
-                                                           GKCZYX,
-                                                           Empty_Tuple,
-                                                           NGKDHW,
-                                                           ConvFwd1x1S1P0,
-                                                           Interwave>,
-            Shards,
-            ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwdDefault,
+                                                                                  Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1P0,
+                                                                                  Intrawave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  NGCDHW,
+                                                                                  GKCZYX,
+                                                                                  Empty_Tuple,
+                                                                                  NGKDHW,
+                                                                                  ConvFwd1x1S1P0,
+                                                                                  Intrawave>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.in
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"
-
-namespace ck::tensor_operation::device::instance {
-
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances =
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances& instances)
-{
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
-                                                           NGCDHW,
-                                                           GKCZYX,
-                                                           Empty_Tuple,
-                                                           NGKDHW,
-                                                           ConvFwdDefault,
-                                                           Intrawave>,
-            Shards,
-            ShardIndex>{});
-    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
-                                                                                      NGCDHW,
-                                                                                      GKCZYX,
-                                                                                      Empty_Tuple,
-                                                                                      NGKDHW,
-                                                                                      ConvFwd1x1P0,
-                                                                                      Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
-    add_device_operation_instances(
-        instances,
-        ck::util::filter_tuple_by_modulo_t<
-            device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
-                                                           NGCDHW,
-                                                           GKCZYX,
-                                                           Empty_Tuple,
-                                                           NGKDHW,
-                                                           ConvFwd1x1S1P0,
-                                                           Intrawave>,
-            Shards,
-            ShardIndex>{});
-}
-
-} // namespace ck::tensor_operation::device::instance
-
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
@@ -1,14 +1,15 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {

-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances =
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
                                                                GKCZYX,
@@ -20,43 +21,35 @@ using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances =
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances)
+                                                                PassThrough>>>& instances)
 {
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<3,
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
                                                                                 NGCDHW,
                                                                                 GKCZYX,
                                                                                 Empty_Tuple,
                                                                                 NGKDHW,
-                                                                                 ConvFwdDefault>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
                                                                                 NGCDHW,
                                                                                 GKCZYX,
                                                                                 Empty_Tuple,
                                                                                 NGKDHW,
-                                                                                 ConvFwd1x1P0>,
-                                       Shards,
-                                       ShardIndex>{});
-
-    add_device_operation_instances(
-        instances,
-        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<3,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave>{});
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
                                                                                 NGCDHW,
                                                                                 GKCZYX,
                                                                                 Empty_Tuple,
                                                                                 NGKDHW,
-                                                                                 ConvFwd1x1S1P0>,
-                                       Shards,
-                                       ShardIndex>{});
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp
@@ -3,11 +3,13 @@

 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {

-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances =
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
                                                                GKCZYX,
@@ -19,44 +21,35 @@ using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f16_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwdDefault,
-                                                                                     Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f16_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1P0,
-                                                                                     Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f16_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1S1P0,
-                                                                                     Interwave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp
@@ -3,11 +3,13 @@

 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {

-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances =
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
                                                                GKCZYX,
@@ -19,44 +21,35 @@ using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances
                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwdDefault,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwdDefault,
+                                                                                 Interwave>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1P0,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Interwave>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1S1P0,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Interwave>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp
@@ -3,11 +3,13 @@

 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/utility/filter_tuple.hpp"

-namespace ck::tensor_operation::device::instance {
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {

-using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances =
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
                                                                GKCZYX,
@@ -19,44 +21,35 @@ using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances
                                                                F32,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>;
-template <int Shards, int ShardIndex>
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances_shard(
-    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances& instances)
+                                                                PassThrough>>>& instances)
 {
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwdDefault,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwdDefault,
+                                                                                 Intrawave>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1P0,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1P0,
+                                                                                 Intrawave>{});
    add_device_operation_instances(instances,
-                                   ck::util::filter_tuple_by_modulo_t<
-                                       device_grouped_conv_fwd_xdl_f32_mem_instances<3,
-                                                                                     NGCDHW,
-                                                                                     GKCZYX,
-                                                                                     Empty_Tuple,
-                                                                                     NGKDHW,
-                                                                                     ConvFwd1x1S1P0,
-                                                                                     Intrawave>,
-                                       Shards,
-                                       ShardIndex>{});
+                                   device_grouped_conv_fwd_xdl_f32_mem_instances<3,
+                                                                                 NGCDHW,
+                                                                                 GKCZYX,
+                                                                                 Empty_Tuple,
+                                                                                 NGKDHW,
+                                                                                 ConvFwd1x1S1P0,
+                                                                                 Intrawave>{});
 }

-} // namespace ck::tensor_operation::device::instance
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck