From b6540cb96a7f7c2b320c3b74d224cfb2e0f2bf77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <>
Date: Thu, 18 Dec 2025 06:31:22 -0500
Subject: [PATCH] List bwd instances.

---
 ...grouped_conv_bwd_algorithms_instances.json | 1257 +++++++++++++++++
 .../grouped_conv_bwd_algorithms_summary.md    |  460 ++++++
 .../grouped_conv_bwd_data_instances.json      |  473 +++++++
 .../grouped_conv_bwd_weight_instances.json    |  648 +++++++++
 4 files changed, 2838 insertions(+)
 create mode 100644 experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json
 create mode 100644 experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md
 create mode 100644 experimental/builder/instances/grouped_conv_bwd_data_instances.json
 create mode 100644 experimental/builder/instances/grouped_conv_bwd_weight_instances.json

diff --git a/experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json b/experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json
new file mode 100644
index 0000000000..fa09d6761c
--- /dev/null
+++ b/experimental/builder/instances/grouped_conv_bwd_algorithms_instances.json
@@ -0,0 +1,1257 @@
+{
+  "description": "Comprehensive listing of grouped convolution backward algorithms and their instances in the Composable Kernel library",
+  "version": "Complete analysis including all algorithms in static library",
+  "note": "All listed algorithms are part of the static library with pre-compiled instances",
+  
+  "algorithms": {
+    "backward_data": {
+      "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp",
+        "description": "XDL (Matrix Core) based backward data convolution with CShuffle optimization",
+        "instruction_set": "XDL/MFMA (AMD Matrix Cores)",
+        "status": "active",
+        "features": [
+          "CShuffle optimization",
+          "Multiple spatial dimensions (1D, 2D, 3D)",
+          "Prefetch pipelining",
+          "Supports multiple D tensors for fused operations"
+        ],
+        "data_types": ["FP16", "BF16", "FP32", "TF32", "FP8", "BF8"],
+        "specializations": ["ConvBwdDataDefault", "ConvBwdDataFilter1x1Stride1Pad0"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp",
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp",
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_scale_instance.hpp",
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_tf32_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/*.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/xdl/*.cpp"
+          ]
+        },
+        "supported_layouts": {
+          "2D": ["GNHWK/GKYXC/GNHWC", "NHWGK/GKYXC/NHWGC", "NGKHW/GKCYX/NGCHW"],
+          "3D": ["GNDHWK/GKZYXC/GNDHWC", "NDHWGK/GKZYXC/NDHWGC"]
+        }
+      },
+      "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp",
+        "description": "WMMA (Wave Matrix Multiply Accumulate) based backward data convolution",
+        "instruction_set": "WMMA (16x16 matrix operations)",
+        "status": "active",
+        "features": [
+          "CShuffle optimization",
+          "Flexible block sizes",
+          "Optimized for specific GPU architectures"
+        ],
+        "data_types": ["FP16", "INT8"],
+        "specializations": ["ConvBwdDataDefault", "ConvBwdDataFilter1x1Stride1Pad0"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_f16_instance.hpp",
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_i8_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/wmma/*.cpp"
+          ]
+        },
+        "wmma_configuration": {
+          "wmma_size": "16x16",
+          "K1_FP16": 8,
+          "K1_INT8": 16
+        }
+      }
+    },
+    
+    "backward_weight": {
+      "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp",
+        "description": "Latest XDL-based backward weight convolution with advanced optimizations (version 3)",
+        "instruction_set": "XDL/MFMA (AMD Matrix Cores)",
+        "version": "v3",
+        "status": "active_recommended",
+        "features": [
+          "Latest XDL implementation with CShuffle",
+          "Split-K support with auto-deduction",
+          "Multiple pipeline versions (v1, v2, v3, v4)",
+          "Pipeline schedulers (Intrawave, Interwave)",
+          "Dual LDS buffer support (v4 pipeline)",
+          "Support for non-power-of-2 block sizes"
+        ],
+        "data_types": ["FP16", "BF16", "FP32", "TF32"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "pipeline_versions": ["v1", "v2", "v3", "v4"],
+        "pipeline_schedulers": ["Intrawave", "Interwave"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_v3_xdl_gnhwc_gkyxc_gnhwk_f16_*.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_f16_*.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_v3_xdl_*.cpp"
+          ]
+        },
+        "sample_instances": {
+          "FP16": [
+            {
+              "BlockSize": 64,
+              "MPerBlock": 32,
+              "NPerBlock": 32,
+              "K0PerBlock": 32,
+              "K1": 8,
+              "MPerXDL": 32,
+              "NPerXDL": 32,
+              "MXdlPerWave": 1,
+              "NXdlPerWave": 1
+            },
+            {
+              "BlockSize": 64,
+              "MPerBlock": 64,
+              "NPerBlock": 80,
+              "K0PerBlock": 32,
+              "K1": 8,
+              "MPerXDL": 16,
+              "NPerXDL": 16,
+              "MXdlPerWave": 4,
+              "NXdlPerWave": 5,
+              "note": "Irregular NPerBlock=80"
+            }
+          ]
+        }
+      },
+      
+      "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp",
+        "description": "Latest WMMA-based backward weight convolution (version 3)",
+        "instruction_set": "WMMA (16x16 matrix operations)",
+        "version": "v3",
+        "status": "active_recommended",
+        "features": [
+          "WMMA 16x16 matrix operations",
+          "CShuffle optimization",
+          "Pipeline versions and schedulers",
+          "Support for non-power-of-2 block sizes"
+        ],
+        "data_types": ["FP16", "BF16"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_bf16_instance.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/*.cpp"
+          ]
+        },
+        "sample_instances": {
+          "FP16": [
+            {
+              "BlockSize": 64,
+              "MPerBlock": 32,
+              "NPerBlock": 32,
+              "K0PerBlock": 32,
+              "K1": 8,
+              "MPerWmma": 16,
+              "NPerWmma": 16,
+              "MRepeat": 2,
+              "NRepeat": 1
+            },
+            {
+              "BlockSize": 128,
+              "MPerBlock": 96,
+              "NPerBlock": 128,
+              "K0PerBlock": 64,
+              "K1": 8,
+              "MRepeat": 6,
+              "NRepeat": 2,
+              "note": "Non-power-of-2 MPerBlock=96"
+            }
+          ]
+        }
+      },
+      
+      "DeviceGroupedConvBwdWeight_Xdl_CShuffle": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp",
+        "description": "Original XDL-based backward weight convolution (version 1)",
+        "instruction_set": "XDL/MFMA",
+        "version": "v1",
+        "status": "active_legacy",
+        "features": [
+          "XDL/MFMA matrix core operations",
+          "CShuffle optimization",
+          "Supports transpose operations for NCHW layouts",
+          "Broader layout support than V3"
+        ],
+        "data_types": ["FP16", "BF16", "FP32", "TF32", "FP8", "BF8"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "1D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_*.cpp"
+          ],
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_*.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_*.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_*.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_*.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/*.cpp"
+          ]
+        },
+        "supported_layouts": {
+          "2D": ["GNHWC/GKYXC/GNHWK", "NHWGC/GKYXC/NHWGK", "NGCHW/GKCYX/NGKHW", "NGCHW/GKYXC/NGKHW"],
+          "3D": ["GNDHWC/GKZYXC/GNDHWK", "NDHWGC/GKZYXC/NDHWGK"]
+        }
+      },
+      
+      "DeviceGroupedConvBwdWeight_Wmma_CShuffle": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp",
+        "description": "Original WMMA-based backward weight for 3D convolutions only",
+        "instruction_set": "WMMA (16x16 matrix operations)",
+        "version": "v1",
+        "status": "active_3d_only",
+        "features": [
+          "WMMA 16x16 matrix operations",
+          "CShuffle optimization",
+          "Specialized for 3D convolutions ONLY",
+          "Different template structure than other WMMA variants"
+        ],
+        "data_types": ["FP16", "BF16", "FP32"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "spatial_dimensions": ["3D"],
+        "supported_layouts": {
+          "3D": ["NDHWGC/GKZYXC/NDHWGK", "GNDHWC/GKZYXC/GNDHWK"]
+        },
+        "note": "This is NOT a general-purpose WMMA algorithm. It only supports 3D convolutions."
+      },
+      
+      "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp",
+        "description": "Two-stage XDL-based backward weight for large convolutions",
+        "instruction_set": "XDL/MFMA",
+        "version": "two_stage",
+        "status": "active",
+        "features": [
+          "Two-stage computation with intermediate workspace",
+          "Better memory efficiency for large problems",
+          "XDL matrix core operations",
+          "Multiple pipeline versions (v1, v2, v5)",
+          "Group merging optimization (NumGroupsToMerge)",
+          "Supports irregular block sizes (e.g., MPerBlock=48, NPerBlock=80, NPerBlock=112)"
+        ],
+        "data_types": ["FP16", "BF16"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "pipeline_versions": ["v1", "v2", "v5"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev*.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_*.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_*.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/*.cpp"
+          ]
+        },
+        "special_features": {
+          "num_groups_to_merge": "Enables group merging for better performance (1, 2, 4, 8)",
+          "irregular_blocks": "Supports MPerBlock/NPerBlock values like 48, 80, 112, 208"
+        }
+      },
+      
+      "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp",
+        "description": "Two-stage WMMA-based backward weight convolution",
+        "instruction_set": "WMMA (16x16 matrix operations)",
+        "version": "two_stage_v3",
+        "status": "active",
+        "features": [
+          "Two-stage computation",
+          "WMMA 16x16 matrix operations",
+          "Pipeline versions",
+          "Group merging optimization"
+        ],
+        "data_types": ["FP16", "BF16"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/wmma/*.cpp"
+          ]
+        }
+      },
+      
+      "DeviceGroupedConvBwdWeight_DL": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp",
+        "description": "DL (non-XDL, non-WMMA) variant built on dot-product/FMA instructions instead of matrix-core instructions",
+        "instruction_set": "DL (DLOPS dot-product instructions; no matrix cores required)",
+        "version": "dl",
+        "status": "active",
+        "features": [
+          "Does not use XDL/MFMA or WMMA matrix instructions",
+          "Per-thread tiling controlled by M1PerThread / N1PerThread",
+          "Suitable for specific problem sizes",
+          "Fallback for GPUs without matrix-core support"
+        ],
+        "data_types": ["FP16", "BF16/FP32 (mixed)", "FP32"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "1D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp"
+          ],
+          "2D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp",
+            "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_*.cpp"
+          ],
+          "3D": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/*.cpp"
+          ]
+        },
+        "sample_instances": {
+          "FP16": [{
+            "BlockSize": 256,
+            "MPerBlock": 128,
+            "NPerBlock": 128,
+            "K0PerBlock": 16,
+            "K1": 1
+          }]
+        }
+      },
+      
+      "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp",
+        "description": "XDL-based backward weight with Multiple D tensor support for fused operations",
+        "instruction_set": "XDL/MFMA",
+        "version": "multiple_d",
+        "status": "active",
+        "features": [
+          "Supports additional input tensors (D tensors)",
+          "Fused element-wise operations (Bilinear, Scale)",
+          "XDL matrix core operations",
+          "CShuffle optimization"
+        ],
+        "data_types": ["FP16", "BF16", "FP32", "TF32", "FP8", "BF8"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "fused_operations": ["Bilinear", "Scale"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp",
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "3D_bilinear": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/*.cpp"
+          ],
+          "3D_scale": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/*.cpp"
+          ]
+        },
+        "sample_instances": {
+          "FP16_Bilinear": [
+            {
+              "BlockSize": 64,
+              "MPerBlock": 64,
+              "NPerBlock": 64,
+              "K0PerBlock": 4,
+              "K1": 8,
+              "MPerXDL": 32,
+              "NPerXDL": 32,
+              "ElementwiseOp": "Bilinear"
+            }
+          ]
+        }
+      },
+      
+      "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": {
+        "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp",
+        "description": "WMMA-based backward weight with Multiple D tensor support",
+        "instruction_set": "WMMA (16x16 matrix operations)",
+        "version": "multiple_d_v3",
+        "status": "active",
+        "features": [
+          "WMMA 16x16 matrix operations",
+          "Supports fused Scale operation",
+          "CShuffle optimization"
+        ],
+        "data_types": ["FP16", "BF16"],
+        "specializations": ["ConvBwdWeightDefault", "ConvBwdWeightFilter1x1Stride1Pad0"],
+        "fused_operations": ["Scale"],
+        "instance_header_files": [
+          "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp"
+        ],
+        "instantiation_cpp_files": {
+          "3D_scale": [
+            "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/wmma/*.cpp"
+          ]
+        }
+      }
+    }
+  },
+  
+  "template_parameters_detailed": {
+    "XDL_parameters": {
+      "NDimSpatial": {
+        "type": "index_t",
+        "description": "Number of spatial dimensions",
+        "values": [1, 2, 3]
+      },
+      "InLayout": {
+        "type": "typename",
+        "description": "Input tensor layout",
+        "examples": ["GNHWC", "NHWGC", "NGCHW", "GNDHWC", "GNWC"]
+      },
+      "WeiLayout": {
+        "type": "typename",
+        "description": "Weight tensor layout",
+        "examples": ["GKYXC", "GKCYX", "GKZYXC", "GKXC"]
+      },
+      "OutLayout": {
+        "type": "typename",
+        "description": "Output tensor layout",
+        "examples": ["GNHWK", "NHWGK", "NGKHW", "GNDHWK", "GNWK"]
+      },
+      "InDataType": {
+        "type": "typename",
+        "description": "Input data type",
+        "values": ["F16", "BF16", "F32"]
+      },
+      "WeiDataType": {
+        "type": "typename",
+        "description": "Weight data type",
+        "values": ["F16", "BF16", "F32"]
+      },
+      "OutDataType": {
+        "type": "typename",
+        "description": "Output data type",
+        "values": ["F16", "BF16", "F32"]
+      },
+      "AccDataType": {
+        "type": "typename",
+        "description": "Accumulator data type",
+        "values": ["F32", "I32"],
+        "note": "Typically F32 for floating point, I32 for INT8"
+      },
+      "ConvolutionBackwardWeightSpecialization": {
+        "type": "enum",
+        "values": ["Default", "Filter1x1Stride1Pad0"]
+      },
+      "BlockSize": {
+        "type": "index_t",
+        "description": "Total number of threads per block",
+        "typical_values": [32, 64, 96, 128, 256]
+      },
+      "MPerBlock": {
+        "type": "index_t",
+        "description": "M dimension of GEMM tile per block",
+        "typical_values": [16, 32, 48, 64, 80, 96, 112, 128, 208, 256]
+      },
+      "NPerBlock": {
+        "type": "index_t",
+        "description": "N dimension of GEMM tile per block",
+        "typical_values": [16, 32, 48, 64, 80, 96, 112, 128, 208, 256]
+      },
+      "K0PerBlock": {
+        "type": "index_t",
+        "description": "K0 dimension blocking per block",
+        "typical_values": [4, 8, 16, 32, 64, 128]
+      },
+      "K1": {
+        "type": "index_t",
+        "description": "Vector width in K dimension",
+        "FP16_BF16": 8,
+        "FP32": 4,
+        "INT8": 16
+      },
+      "MPerXDL": {
+        "type": "index_t",
+        "description": "M dimension per XDL instruction",
+        "values": [16, 32]
+      },
+      "NPerXDL": {
+        "type": "index_t",
+        "description": "N dimension per XDL instruction",
+        "values": [16, 32]
+      },
+      "MXdlPerWave": {
+        "type": "index_t",
+        "description": "Number of XDL tiles in M dimension per wave",
+        "typical_values": [1, 2, 3, 4, 5, 6, 7, 8, 13]
+      },
+      "NXdlPerWave": {
+        "type": "index_t",
+        "description": "Number of XDL tiles in N dimension per wave",
+        "typical_values": [1, 2, 3, 4, 5, 7, 8, 13, 16]
+      },
+      "ABlockTransferThreadClusterLengths_K0_M_K1": {
+        "type": "Sequence<index_t, index_t, index_t>",
+        "description": "Thread cluster dimensions for A matrix block transfer",
+        "examples": [[4, 8, 1], [4, 16, 1], [4, 32, 1], [8, 8, 1]]
+      },
+      "ABlockTransferThreadClusterArrangeOrder": {
+        "type": "Sequence<index_t, index_t, index_t>",
+        "description": "Thread cluster arrange order for A",
+        "common_value": [2, 0, 1]
+      },
+      "ABlockTransferSrcAccessOrder": {
+        "type": "Sequence<index_t, index_t, index_t>",
+        "description": "Source access order for A block transfer",
+        "common_values": [[1, 0, 2], [2, 0, 1]]
+      },
+      "ABlockTransferSrcVectorDim": {
+        "type": "index_t",
+        "description": "Vector dimension for A source",
+        "typical_value": 1
+      },
+      "ABlockTransferSrcScalarPerVector": {
+        "type": "index_t",
+        "description": "Vector load size for A matrix source",
+        "typical_values": [1, 2, 4, 8, 16]
+      },
+      "ABlockTransferDstScalarPerVector_K1": {
+        "type": "index_t",
+        "description": "Vector store size for A matrix destination in K1 dimension",
+        "typical_values": [1, 2, 4, 8]
+      },
+      "ABlockLdsAddExtraM": {
+        "type": "bool",
+        "description": "Add extra padding in M dimension for LDS to avoid bank conflicts",
+        "typical_values": [false, true]
+      },
+      "BBlockTransferThreadClusterLengths_K0_N_K1": {
+        "type": "Sequence<index_t, index_t, index_t>",
+        "description": "Thread cluster dimensions for B matrix block transfer",
+        "examples": [[4, 8, 1], [4, 16, 1], [4, 32, 1]]
+      },
+      "BBlockTransferSrcScalarPerVector": {
+        "type": "index_t",
+        "description": "Vector load size for B matrix source",
+        "typical_values": [1, 2, 4, 8, 16]
+      },
+      "BBlockTransferDstScalarPerVector_K1": {
+        "type": "index_t",
+        "description": "Vector store size for B matrix destination in K1 dimension",
+        "typical_values": [1, 2, 4, 8]
+      },
+      "BBlockLdsAddExtraN": {
+        "type": "bool",
+        "description": "Add extra padding in N dimension for LDS to avoid bank conflicts"
+      },
+      "CShuffleMXdlPerWavePerShuffle": {
+        "type": "index_t",
+        "description": "M XDL tiles per shuffle operation",
+        "typical_value": 1
+      },
+      "CShuffleNXdlPerWavePerShuffle": {
+        "type": "index_t",
+        "description": "N XDL tiles per shuffle operation",
+        "typical_value": 1
+      },
+      "CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock": {
+        "type": "Sequence<index_t, index_t, index_t, index_t>",
+        "description": "Thread cluster for C matrix output",
+        "examples": [[1, 8, 1, 8], [1, 16, 1, 16], [1, 32, 1, 8]]
+      },
+      "CBlockTransferScalarPerVector_NWaveNPerXdl": {
+        "type": "index_t",
+        "description": "Vector store size for C matrix output",
+        "typical_values": [1, 2, 4, 8]
+      },
+      "BlkGemmPipeSched": {
+        "type": "BlockGemmPipelineScheduler",
+        "description": "Block GEMM pipeline scheduler",
+        "values": ["Intrawave", "Interwave"]
+      },
+      "BlkGemmPipelineVer": {
+        "type": "BlockGemmPipelineVersion",
+        "description": "Block GEMM pipeline version",
+        "values": ["v1", "v2", "v3", "v4", "v5"]
+      },
+      "ComputeTypeA": {
+        "type": "typename",
+        "description": "Compute data type for A matrix",
+        "examples": ["F16", "BF16", "F32", "TF32", "BF8", "F8"],
+        "note": "Can differ from input type, e.g., TF32 for FP32 input, BF8/F8 for FP16"
+      },
+      "ComputeTypeB": {
+        "type": "typename",
+        "description": "Compute data type for B matrix",
+        "examples": ["F16", "BF16", "F32", "TF32", "BF8", "F8"]
+      }
+    },
+    
+    "WMMA_parameters": {
+      "BlockSize": {
+        "type": "index_t",
+        "description": "Total number of threads per block",
+        "typical_values": [32, 64, 96, 128, 256]
+      },
+      "MPerBlock": {
+        "type": "index_t",
+        "description": "M dimension per block",
+        "typical_values": [16, 32, 48, 64, 96, 128, 256]
+      },
+      "NPerBlock": {
+        "type": "index_t",
+        "description": "N dimension per block",
+        "typical_values": [16, 32, 64, 128, 256]
+      },
+      "K0PerBlock": {
+        "type": "index_t",
+        "description": "K0 dimension per block",
+        "typical_values": [4, 8, 32, 64, 128]
+      },
+      "K1": {
+        "type": "index_t",
+        "description": "Vector width",
+        "FP16_BF16": 8,
+        "INT8": 16
+      },
+      "MPerWmma": {
+        "type": "index_t",
+        "description": "M dimension per WMMA instruction",
+        "value": 16,
+        "note": "Fixed at 16 for WMMA"
+      },
+      "NPerWmma": {
+        "type": "index_t",
+        "description": "N dimension per WMMA instruction",
+        "value": 16,
+        "note": "Fixed at 16 for WMMA"
+      },
+      "MRepeat": {
+        "type": "index_t",
+        "description": "Number of WMMA operations in M dimension",
+        "typical_values": [1, 2, 3, 4, 6, 8]
+      },
+      "NRepeat": {
+        "type": "index_t",
+        "description": "Number of WMMA operations in N dimension",
+        "typical_values": [1, 2, 4, 8]
+      }
+    },
+    
+    "TwoStage_specific_parameters": {
+      "NumGroupsToMerge": {
+        "type": "index_t",
+        "description": "Number of groups to merge in two-stage algorithm for better performance",
+        "typical_values": [1, 2, 4, 8]
+      }
+    },
+    
+    "DirectLoad_specific_parameters": {
+      "M1PerThread": {
+        "type": "index_t",
+        "description": "M1 dimension per thread in direct load pattern"
+      },
+      "N1PerThread": {
+        "type": "index_t",
+        "description": "N1 dimension per thread in direct load pattern"
+      }
+    }
+  },
+  
+  "file_references": {
+    "algorithm_implementation_headers": {
+      "path": "include/ck/tensor_operation/gpu/device/impl/",
+      "backward_data": [
+        "device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp",
+        "device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
+      ],
+      "backward_weight": [
+        "device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp",
+        "device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp",
+        "device_grouped_conv_bwd_weight_xdl_cshuffle.hpp",
+        "device_grouped_conv_bwd_weight_wmma_cshuffle.hpp",
+        "device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp",
+        "device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp",
+        "device_grouped_conv_bwd_weight_dl.hpp",
+        "device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp",
+        "device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp"
+      ]
+    },
+    
+    "instance_template_headers": {
+      "path": "library/include/ck/library/tensor_operation_instance/gpu/",
+      "backward_data": {
+        "directory": "grouped_conv_bwd_data/",
+        "files": [
+          "device_grouped_conv_bwd_data_xdl_instance.hpp",
+          "device_grouped_conv_bwd_data_wmma_f16_instance.hpp",
+          "device_grouped_conv_bwd_data_wmma_i8_instance.hpp",
+          "device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp",
+          "device_grouped_conv_bwd_data_xdl_scale_instance.hpp",
+          "device_grouped_conv_bwd_data_transpose_xdl_instance.hpp"
+        ]
+      },
+      "backward_weight": {
+        "directory": "grouped_conv_bwd_weight/",
+        "files": [
+          "device_grouped_conv_bwd_weight_v3_xdl_instance.hpp",
+          "device_grouped_conv_bwd_weight_v3_wmma_instance.hpp",
+          "device_grouped_conv_bwd_weight_xdl_instance.hpp",
+          "device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp",
+          "device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp",
+          "device_grouped_conv_bwd_weight_dl_instance.hpp",
+          "device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp",
+          "device_grouped_conv_bwd_weight_xdl_scale_instance.hpp",
+          "device_grouped_conv_bwd_weight_wmma_scale_instance.hpp",
+          "device_grouped_conv_bwd_weight_wmma_bilinear_instance.hpp"
+        ]
+      }
+    },
+    
+    "instantiation_cpp_files": {
+      "path": "library/src/tensor_operation_instance/gpu/",
+      "backward_data": {
+        "1D": "grouped_conv1d_bwd_data/",
+        "2D": "grouped_conv2d_bwd_data/{wmma,xdl}/",
+        "3D": "grouped_conv3d_bwd_data/{wmma,xdl}/"
+      },
+      "backward_weight": {
+        "1D": "grouped_conv1d_bwd_weight/{dl,xdl}/",
+        "2D": "grouped_conv2d_bwd_weight/{dl,wmma,xdl}/",
+        "3D": [
+          "grouped_conv3d_bwd_weight/{dl,wmma,xdl}/",
+          "grouped_conv3d_bwd_weight_bilinear/",
+          "grouped_conv3d_bwd_weight_scale/"
+        ]
+      }
+    }
+  },
+  
+  "layout_configurations": {
+    "backward_data_2D": {
+      "supported_combinations": [
+        {
+          "output_layout": "GNHWK",
+          "weight_layout": "GKYXC",
+          "input_layout": "GNHWC",
+          "description": "Group-first NHWC layout"
+        },
+        {
+          "output_layout": "NHWGK",
+          "weight_layout": "GKYXC",
+          "input_layout": "NHWGC",
+          "description": "NHWC-style layout with the group dimension placed immediately before the channel dimension"
+        },
+        {
+          "output_layout": "NGKHW",
+          "weight_layout": "GKCYX",
+          "input_layout": "NGCHW",
+          "description": "NCHW variant with grouped C"
+        },
+        {
+          "output_layout": "NGKHW",
+          "weight_layout": "GKYXC",
+          "input_layout": "NGCHW",
+          "description": "Mixed layout NCHW/filter-C-last"
+        }
+      ]
+    },
+    "backward_weight_2D": {
+      "supported_combinations": [
+        {
+          "input_layout": "NHWGC",
+          "weight_layout": "GKYXC",
+          "output_layout": "NHWGK",
+          "algorithms": ["All except Wmma_CShuffle, which is 3D only"]
+        },
+        {
+          "input_layout": "GNHWC",
+          "weight_layout": "GKYXC",
+          "output_layout": "GNHWK",
+          "algorithms": ["XDL variants", "DL"]
+        },
+        {
+          "input_layout": "NGCHW",
+          "weight_layout": "GKCYX",
+          "output_layout": "NGKHW",
+          "algorithms": ["XDL variants (with transpose support)"]
+        },
+        {
+          "input_layout": "NGCHW",
+          "weight_layout": "GKYXC",
+          "output_layout": "NGKHW",
+          "algorithms": ["XDL variants (with transpose support)"]
+        }
+      ]
+    },
+    "backward_weight_3D": {
+      "supported_combinations": [
+        {
+          "input_layout": "GNDHWC",
+          "weight_layout": "GKZYXC",
+          "output_layout": "GNDHWK",
+          "algorithms": ["All"]
+        },
+        {
+          "input_layout": "NDHWGC",
+          "weight_layout": "GKZYXC",
+          "output_layout": "NDHWGK",
+          "algorithms": ["All including Wmma_CShuffle"]
+        }
+      ]
+    }
+  },
+  
+  "statistics": {
+    "total_algorithm_implementations": {
+      "backward_data": 2,
+      "backward_weight": 9,
+      "total": 11
+    },
+    "estimated_instance_counts": {
+      "backward_data": {
+        "XDL_variants": "~200+ instances (FP16, BF16, FP32, TF32, with various optimizations)",
+        "WMMA_variants": "~30 instances (FP16: 14, INT8: 13)"
+      },
+      "backward_weight": {
+        "Xdl_CShuffleV3": "~20 instances (FP16, BF16, FP32 with pipeline variants)",
+        "Wmma_CShuffleV3": "~20 instances (FP16: 9, BF16: 10)",
+        "Xdl_CShuffle_v1": "~50+ instances (FP16, BF16, FP32, TF32, FP8/BF8)",
+        "Wmma_CShuffle_v1": "Limited (3D only)",
+        "TwoStage_Xdl": "~50+ instances (FP16, BF16 with pipev1/v2/v5, regular and irregular)",
+        "TwoStage_Wmma": "~2 instances (FP16, BF16 base configs)",
+        "DL": "~6 instances (FP16, BF16/FP32, FP32 for 1D/2D/3D)",
+        "MultipleD_Xdl": "~30 instances (Bilinear and Scale variants)",
+        "MultipleD_Wmma": "~10 instances (Scale variant)"
+      },
+      "total_estimated": "400-500+ pre-compiled instances"
+    },
+    
+    "data_type_support_matrix": {
+      "FP16": {
+        "backward_data": {
+          "XDL": true,
+          "WMMA": true
+        },
+        "backward_weight": {
+          "Xdl_V3": true,
+          "Wmma_V3": true,
+          "Xdl_V1": true,
+          "Wmma_V1_3D": true,
+          "TwoStage_XDL": true,
+          "TwoStage_WMMA": true,
+          "DL": true,
+          "MultipleD_XDL": true,
+          "MultipleD_WMMA": true
+        }
+      },
+      "BF16": {
+        "backward_data": {
+          "XDL": true,
+          "WMMA": false
+        },
+        "backward_weight": {
+          "Xdl_V3": true,
+          "Wmma_V3": true,
+          "Xdl_V1": true,
+          "Wmma_V1_3D": true,
+          "TwoStage_XDL": true,
+          "TwoStage_WMMA": true,
+          "DL": "mixed (BF16/FP32)",
+          "MultipleD_XDL": "mixed (BF16/FP32)",
+          "MultipleD_WMMA": "mixed (BF16/FP32)"
+        }
+      },
+      "FP32": {
+        "backward_data": {
+          "XDL": true,
+          "WMMA": false
+        },
+        "backward_weight": {
+          "Xdl_V3": true,
+          "Wmma_V3": false,
+          "Xdl_V1": true,
+          "Wmma_V1_3D": true,
+          "TwoStage_XDL": false,
+          "TwoStage_WMMA": false,
+          "DL": true,
+          "MultipleD_XDL": true,
+          "MultipleD_WMMA": false
+        }
+      },
+      "TF32": {
+        "backward_data": {
+          "XDL": "compute_type",
+          "note": "TF32 is compute type, input/output is FP32"
+        },
+        "backward_weight": {
+          "Xdl_V1": "compute_type",
+          "MultipleD_XDL": "compute_type"
+        }
+      },
+      "INT8": {
+        "backward_data": {
+          "WMMA": true
+        },
+        "backward_weight": {
+          "note": "No INT8 backward weight support"
+        }
+      },
+      "FP8_BF8": {
+        "backward_data": {
+          "XDL": "compute_type (newer GPUs)"
+        },
+        "backward_weight": {
+          "Xdl_V1": "compute_type",
+          "MultipleD_XDL": "compute_type"
+        }
+      }
+    }
+  },
+  
+  "instance_examples": {
+    "backward_data_XDL_FP16": {
+      "algorithm": "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp",
+      "cpp_file": "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp",
+      "instances": [
+        {
+          "BlockSize": 64,
+          "MPerBlock": 64,
+          "NPerBlock": 64,
+          "K0PerBlock": 32,
+          "K1": 8,
+          "MPerXDL": 32,
+          "NPerXDL": 32,
+          "MXdlPerWave": 2,
+          "NXdlPerWave": 2,
+          "specialization": "ConvBwdDataDefault"
+        },
+        {
+          "BlockSize": 256,
+          "MPerBlock": 128,
+          "NPerBlock": 256,
+          "K0PerBlock": 32,
+          "K1": 8,
+          "MPerXDL": 32,
+          "NPerXDL": 32,
+          "MXdlPerWave": 2,
+          "NXdlPerWave": 4
+        }
+      ]
+    },
+    
+    "backward_data_WMMA_FP16": {
+      "algorithm": "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_f16_instance.hpp",
+      "cpp_file": "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp",
+      "instances": [
+        {
+          "BlockSize": 128,
+          "MPerBlock": 64,
+          "NPerBlock": 64,
+          "K0PerBlock": 4,
+          "K1": 8,
+          "MPerWmma": 16,
+          "NPerWmma": 16,
+          "MRepeat": 2,
+          "NRepeat": 2
+        },
+        {
+          "BlockSize": 256,
+          "MPerBlock": 128,
+          "NPerBlock": 256,
+          "K0PerBlock": 8,
+          "K1": 8,
+          "MRepeat": 4,
+          "NRepeat": 4
+        }
+      ]
+    },
+    
+    "backward_weight_XDL_V3_FP16": {
+      "algorithm": "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp",
+      "cpp_file": "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_f16_*.cpp",
+      "instances": [
+        {
+          "BlockSize": 64,
+          "MPerBlock": 32,
+          "NPerBlock": 32,
+          "K0PerBlock": 32,
+          "K1": 8,
+          "MPerXDL": 32,
+          "NPerXDL": 32,
+          "MXdlPerWave": 1,
+          "NXdlPerWave": 1,
+          "pipeline_scheduler": "Intrawave",
+          "pipeline_version": "v1"
+        },
+        {
+          "BlockSize": 64,
+          "MPerBlock": 64,
+          "NPerBlock": 80,
+          "K0PerBlock": 32,
+          "K1": 8,
+          "MPerXDL": 16,
+          "NPerXDL": 16,
+          "MXdlPerWave": 4,
+          "NXdlPerWave": 5,
+          "note": "Irregular NPerBlock=80"
+        }
+      ]
+    },
+    
+    "backward_weight_WMMA_V3_FP16": {
+      "algorithm": "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp",
+      "cpp_file": "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp",
+      "instances": [
+        {
+          "BlockSize": 64,
+          "MPerBlock": 32,
+          "NPerBlock": 32,
+          "K0PerBlock": 32,
+          "K1": 8,
+          "MPerWmma": 16,
+          "NPerWmma": 16,
+          "MRepeat": 2,
+          "NRepeat": 1
+        },
+        {
+          "BlockSize": 128,
+          "MPerBlock": 96,
+          "NPerBlock": 128,
+          "K0PerBlock": 64,
+          "K1": 8,
+          "MRepeat": 6,
+          "NRepeat": 2,
+          "note": "Irregular MPerBlock=96"
+        }
+      ]
+    },
+    
+    "backward_weight_XDL_V1_FP16": {
+      "algorithm": "DeviceGroupedConvBwdWeight_Xdl_CShuffle",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp"
+      ],
+      "instances": [
+        {
+          "BlockSize": 64,
+          "MPerBlock": 64,
+          "NPerBlock": 64,
+          "K0PerBlock": 4,
+          "K1": 8,
+          "MPerXDL": 32,
+          "NPerXDL": 32,
+          "MXdlPerWave": 2,
+          "NXdlPerWave": 2
+        }
+      ]
+    },
+    
+    "backward_weight_TwoStage_XDL_FP16": {
+      "algorithm": "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instance.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp"
+      ],
+      "instances": [
+        {
+          "BlockSize": 64,
+          "MPerBlock": 16,
+          "NPerBlock": 16,
+          "K0PerBlock": 32,
+          "K1": 8,
+          "MPerXDL": 16,
+          "NPerXDL": 16,
+          "MXdlPerWave": 1,
+          "NXdlPerWave": 1,
+          "NumGroupsToMerge": 1,
+          "pipeline_version": "v1"
+        },
+        {
+          "BlockSize": 64,
+          "MPerBlock": 64,
+          "NPerBlock": 80,
+          "K0PerBlock": 32,
+          "K1": 8,
+          "MPerXDL": 16,
+          "NPerXDL": 16,
+          "MXdlPerWave": 4,
+          "NXdlPerWave": 5,
+          "NumGroupsToMerge": 1,
+          "note": "Irregular NPerBlock=80"
+        }
+      ]
+    },
+    
+    "backward_weight_DL_FP16": {
+      "algorithm": "DeviceGroupedConvBwdWeight_DL",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instance.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp"
+      ],
+      "instances": [
+        {
+          "BlockSize": 256,
+          "MPerBlock": 128,
+          "NPerBlock": 128,
+          "K0PerBlock": 16,
+          "K1": 1,
+          "M1PerThread": 4,
+          "N1PerThread": 4
+        }
+      ]
+    },
+    
+    "backward_weight_MultipleD_XDL_Bilinear_FP16": {
+      "algorithm": "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle",
+      "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/*.cpp"
+      ],
+      "instances": [
+        {
+          "BlockSize": 64,
+          "MPerBlock": 64,
+          "NPerBlock": 64,
+          "K0PerBlock": 4,
+          "K1": 8,
+          "MPerXDL": 32,
+          "NPerXDL": 32,
+          "MXdlPerWave": 2,
+          "NXdlPerWave": 2,
+          "ElementwiseOp": "Bilinear",
+          "DsTensors": "Tuple<BLayout>"
+        }
+      ]
+    }
+  },
+  
+  "key_optimizations": {
+    "CShuffle": {
+      "description": "Cross-lane shuffle for efficient data redistribution",
+      "benefit": "Reduces shared memory bank conflicts and improves data reuse"
+    },
+    "Split-K": {
+      "description": "Parallelizes reduction dimension",
+      "algorithms": ["Xdl_CShuffleV3"],
+      "auto_deduction": true,
+      "benefit": "Better occupancy for large K dimensions"
+    },
+    "Pipeline_Versions": {
+      "v1": "Basic pipeline",
+      "v2": "Enhanced prefetching with tail number support (supports tails 1-7)",
+      "v3": "Further optimizations",
+      "v4": "Dual LDS buffer support for improved throughput",
+      "v5": "Advanced prefetching for two-stage algorithms"
+    },
+    "Pipeline_Schedulers": {
+      "Intrawave": "Schedule within wave for lower latency",
+      "Interwave": "Schedule across waves for better occupancy"
+    },
+    "LDS_Padding": {
+      "description": "Extra padding to avoid bank conflicts",
+      "parameters": ["ABlockLdsAddExtraM", "BBlockLdsAddExtraN"]
+    },
+    "Optimized_Loads": {
+      "description": "Specialized instances with optimized memory access patterns",
+      "variants": ["optimized_loads", "16_16 (16x16 XDL)", "vec_transpose"]
+    },
+    "Two_Stage": {
+      "description": "Splits computation into two stages with intermediate workspace",
+      "benefit": "Reduces memory footprint for very large convolutions",
+      "workspace_requirement": true
+    },
+    "Group_Merging": {
+      "description": "Merges multiple groups for better performance",
+      "parameter": "NumGroupsToMerge",
+      "algorithms": ["TwoStage variants"]
+    }
+  },
+  
+  "usage_guidelines": {
+    "recommended_algorithms": {
+      "backward_data": {
+        "modern_GPUs": "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 with FP16/BF16",
+        "varied_sizes": "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle",
+        "int8_quantization": "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle"
+      },
+      "backward_weight": {
+        "new_code_XDL": "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 (latest optimizations)",
+        "new_code_WMMA": "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3",
+        "large_convolutions": "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle",
+        "NCHW_layouts": "DeviceGroupedConvBwdWeight_Xdl_CShuffle (v1, has transpose support)",
+        "fused_operations": "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle (Bilinear/Scale)",
+        "3D_WMMA_only": "DeviceGroupedConvBwdWeight_Wmma_CShuffle (3D specific)",
+        "low_memory": "DeviceGroupedConvBwdWeight_DL"
+      }
+    },
+    
+    "instance_selection": {
+      "guidelines": [
+        "Match block sizes to problem dimensions for best occupancy",
+        "Larger blocks (256) generally better for large convolutions",
+        "Smaller blocks (64, 128) for small convolutions or limited resources",
+        "Consider occupancy vs register pressure tradeoffs",
+        "Use specialized 1x1s1p0 instances when applicable",
+        "Test multiple instances to find optimal configuration",
+        "V3 variants use the latest optimizations and are recommended for new code",
+        "Two-stage variants require an intermediate workspace but reduce the memory footprint for very large convolutions"
+      ]
+    },
+    
+    "performance_tips": [
+      "Enable Split-K auto-deduction for V3 XDL (set split_k=-1)",
+      "Use FP16/BF16 on modern AMD GPUs for best performance",
+      "TF32 compute can accelerate FP32 convolutions on supported hardware",
+      "For very large convolutions, consider two-stage algorithms",
+      "For fused operations (e.g., gradient scaling), use MultipleD variants",
+      "NCHW layouts require transpose support (use v1 XDL variants)",
+      "INT8 backward data is only available with WMMA"
+    ]
+  }
+}
diff --git a/experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md b/experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md
new file mode 100644
index 0000000000..8ada5248ec
--- /dev/null
+++ b/experimental/builder/instances/grouped_conv_bwd_algorithms_summary.md
@@ -0,0 +1,460 @@
+# Grouped Convolution Backward Algorithms Summary
+
+This document provides a comprehensive overview of the backward convolution algorithms exposed by the Composable Kernel library for **grouped convolutions**.
+
+## Overview
+
+The library provides optimized GPU kernels for two types of backward convolution operations:
+1. **Backward Data** (gradient with respect to input)
+2. **Backward Weight** (gradient with respect to weights)
+
+All algorithms are part of the static library and have pre-compiled instances.
+
+## 1. Backward Data Convolution Algorithms
+
+### 1.1 DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp`
+
+**Description:** XDL (Matrix Core) based backward data convolution with CShuffle optimization.
+
+**Key Features:**
+- Uses AMD Matrix Core Instructions (XDL/MFMA)
+- CShuffle for efficient data movement
+- Supports multiple spatial dimensions (1D, 2D, 3D)
+- Multiple data types: FP16, BF16, FP32, TF32, FP8, BF8
+- Two specializations:
+  - `ConvBwdDataDefault`: General convolution
+  - `ConvBwdDataFilter1x1Stride1Pad0`: Optimized for 1x1 filters with stride 1 and no padding
+
+**Instance Files:**
+- `device_grouped_conv_bwd_data_xdl_instance.hpp` - Main XDL instances (FP16, BF16, FP32, TF32)
+- `device_grouped_conv_bwd_data_xdl_bilinear_instance.hpp` - Bilinear variants
+- `device_grouped_conv_bwd_data_xdl_scale_instance.hpp` - Scale variants
+- `device_grouped_conv_bwd_data_transpose_xdl_instance.hpp` - Transpose variants
+
+**Instantiation Sources:**
+- `grouped_conv2d_bwd_data/xdl/*.cpp` - 2D convolution instances
+- `grouped_conv3d_bwd_data/xdl/*.cpp` - 3D convolution instances
+
+### 1.2 DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp`
+
+**Description:** WMMA (Wave Matrix Multiply Accumulate) based backward data convolution.
+
+**Key Features:**
+- Uses WMMA instructions (16x16 matrix operations)
+- More flexible for different block sizes
+- Supports FP16 and INT8 data types
+- Optimized for specific GPU architectures
+
+**Instance Files:**
+- `device_grouped_conv_bwd_data_wmma_f16_instance.hpp` - FP16 WMMA instances
+- `device_grouped_conv_bwd_data_wmma_i8_instance.hpp` - INT8 WMMA instances
+
+**Instantiation Sources:**
+- `grouped_conv2d_bwd_data/wmma/*.cpp` - 2D convolution instances
+- `grouped_conv3d_bwd_data/wmma/*.cpp` - 3D convolution instances
+
+## 2. Backward Weight Convolution Algorithms
+
+### 2.1 DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp`
+
+**Description:** Latest XDL-based backward weight convolution, version 3 with advanced optimizations.
+
+**Status:** ✅ **Active - Part of static library** (Recommended for new code)
+
+**Key Features:**
+- Latest XDL implementation with CShuffle
+- Support for split-K optimization with auto-deduction
+- Multiple pipeline versions (v1, v2, v3, v4)
+- Block GEMM pipeline schedulers (Intrawave, Interwave)
+- Dual LDS buffer support (v4 pipeline)
+- Data types: FP16, BF16, FP32, TF32
+- Two specializations:
+  - `ConvBwdWeightDefault`: General convolution
+  - `ConvBwdWeightFilter1x1Stride1Pad0`: Optimized for 1x1 filters
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_v3_xdl_instance.hpp`
+
+**Instantiation Sources (2D):**
+- `grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_v3_xdl_gnhwc_gkyxc_gnhwk_*.cpp`
+- `grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_*.cpp`
+
+### 2.2 DeviceGroupedConvBwdWeight_Wmma_CShuffleV3
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp`
+
+**Description:** Latest WMMA-based backward weight convolution, version 3.
+
+**Status:** ✅ **Active - Part of static library** (Recommended for WMMA)
+
+**Key Features:**
+- WMMA 16x16 matrix operations
+- CShuffle optimization
+- Pipeline schedulers and versions
+- Data types: FP16, BF16
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_v3_wmma_instance.hpp`
+
+**Instantiation Sources (2D):**
+- `grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_*.cpp`
+
+### 2.3 DeviceGroupedConvBwdWeight_Xdl_CShuffle
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp`
+
+**Description:** Original XDL-based backward weight convolution (version 1).
+
+**Status:** ✅ **Active - Part of static library** (Legacy, but still maintained)
+
+**Key Features:**
+- XDL/MFMA matrix core operations
+- CShuffle optimization
+- Data types: FP16, BF16, FP32, TF32, FP8/BF8
+- Supports transpose operations for NCHW layouts
+- Two specializations (Default, Filter1x1Stride1Pad0)
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_xdl_instance.hpp`
+
+**Instantiation Sources (2D):**
+- `grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_*.cpp`
+- `grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_*.cpp`
+- `grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkcyx_ngkhw_*.cpp`
+- `grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_*.cpp`
+
+**Instantiation Sources (1D & 3D):**
+- `grouped_conv1d_bwd_weight/xdl/*.cpp`
+- `grouped_conv3d_bwd_weight/xdl/*.cpp`
+
+### 2.4 DeviceGroupedConvBwdWeight_Wmma_CShuffle
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp`
+
+**Description:** Original WMMA-based backward weight convolution for 3D convolutions only.
+
+**Status:** ✅ **Active - Part of static library** (3D-specific)
+
+**Key Features:**
+- WMMA 16x16 matrix operations
+- CShuffle optimization
+- **Specialized for 3D convolutions only**
+- Data types: FP16, BF16, FP32
+- Two specializations (Default, Filter1x1Stride1Pad0)
+
+**Supported Layouts (3D only):**
+- NDHWGC/GKZYXC/NDHWGK
+- GNDHWC/GKZYXC/GNDHWK
+
+**Note:** This algorithm is specific to 3D convolutions and uses a different template parameter structure than the other WMMA variants.
+
+### 2.5 DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp`
+
+**Description:** Two-stage XDL-based backward weight for large convolutions with memory constraints.
+
+**Status:** ✅ **Active - Part of static library**
+
+**Key Features:**
+- Two-stage computation with intermediate workspace
+- Better memory efficiency for large problems
+- XDL matrix core operations
+- Multiple pipeline versions (v1, v2, v5)
+- Group merging optimization (NumGroupsToMerge parameter)
+- Data types: FP16, BF16
+- Supports irregular MPerBlock/NPerBlock configurations
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp`
+
+**Instantiation Sources (2D):**
+- `grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_*_pipev*.cpp`
+- `grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_*_pipev*.cpp`
+- `grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_*_pipev*.cpp`
+
+**Instantiation Sources (3D):**
+- `grouped_conv3d_bwd_weight/xdl/*.cpp`
+
+### 2.6 DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp`
+
+**Description:** Two-stage WMMA-based backward weight convolution.
+
+**Status:** ✅ **Active - Part of static library**
+
+**Key Features:**
+- Two-stage computation
+- WMMA 16x16 matrix operations
+- Pipeline versions
+- Group merging optimization
+- Data types: FP16, BF16
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp`
+
+**Instantiation Sources (2D):**
+- `grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_*_pipev*.cpp`
+
+**Instantiation Sources (3D):**
+- `grouped_conv3d_bwd_weight/wmma/*.cpp`
+
+### 2.7 DeviceGroupedConvBwdWeight_DL
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp`
+
+**Description:** Direct Load variant using a different memory access pattern.
+
+**Status:** ✅ **Active - Part of static library**
+
+**Key Features:**
+- Direct load memory access pattern (no shared memory for A/B)
+- Suitable for specific problem sizes
+- Supports 1D, 2D, and 3D convolutions
+- Data types: FP16, BF16/FP32 (mixed precision), FP32
+- Two specializations (Default, Filter1x1Stride1Pad0)
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_dl_instance.hpp`
+
+**Instantiation Sources:**
+- `grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_*.cpp`
+- `grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_*.cpp`
+- `grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_*.cpp`
+
+### 2.8 DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp`
+
+**Description:** XDL-based backward weight with Multiple D tensor support for fused operations.
+
+**Status:** ✅ **Active - Part of static library**
+
+**Key Features:**
+- Supports additional input tensors (D tensors) for fused operations
+- Fused element-wise operations (Bilinear, Scale)
+- XDL matrix core operations
+- Data types: FP16, BF16, FP32, TF32, FP8/BF8
+- Two specializations (Default, Filter1x1Stride1Pad0)
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp` - Bilinear fusion
+- `device_grouped_conv_bwd_weight_xdl_scale_instance.hpp` - Scale fusion
+
+**Instantiation Sources:**
+- `grouped_conv3d_bwd_weight_bilinear/*.cpp` - 3D Bilinear variants
+- `grouped_conv3d_bwd_weight_scale/*.cpp` - 3D Scale variants
+
+### 2.9 DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3
+
+**Implementation File:** `include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp`
+
+**Description:** WMMA-based backward weight with Multiple D tensor support for fused operations.
+
+**Status:** ✅ **Active - Part of static library**
+
+**Key Features:**
+- WMMA 16x16 matrix operations
+- Supports fused Scale operation
+- Data types: FP16, BF16
+
+**Instance Files:**
+- `device_grouped_conv_bwd_weight_wmma_scale_instance.hpp`
+
+**Instantiation Sources:**
+- `grouped_conv3d_bwd_weight_scale/*.cpp` - 3D Scale variants
+
+## 3. Algorithm Comparison
+
+| Algorithm | Version | Instruction Set | Data Types | Fused Ops | Split-K | Dimensions | Status |
+|-----------|---------|----------------|------------|-----------|---------|------------|--------|
+| **Backward Data** | | | | | | | |
+| DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 | v1 | XDL/MFMA | FP16, BF16, FP32, TF32, FP8, BF8 | Yes | No | 1D, 2D, 3D | ✅ Active |
+| DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle | - | WMMA | FP16, INT8 | No | No | 1D, 2D, 3D | ✅ Active |
+| **Backward Weight** | | | | | | | |
+| DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 | v3 | XDL/MFMA | FP16, BF16, FP32, TF32 | No | Yes (auto) | 1D, 2D, 3D | ✅ Active (Recommended) |
+| DeviceGroupedConvBwdWeight_Wmma_CShuffleV3 | v3 | WMMA | FP16, BF16 | No | No | 1D, 2D, 3D | ✅ Active (Recommended) |
+| DeviceGroupedConvBwdWeight_Xdl_CShuffle | v1 | XDL/MFMA | FP16, BF16, FP32, TF32, FP8, BF8 | No | No | 1D, 2D, 3D | ✅ Active (Legacy) |
+| DeviceGroupedConvBwdWeight_Wmma_CShuffle | v1 | WMMA | FP16, BF16, FP32 | No | No | 3D only | ✅ Active (3D-specific) |
+| DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle | Two-stage | XDL/MFMA | FP16, BF16 | No | No | 1D, 2D, 3D | ✅ Active |
+| DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3 | Two-stage | WMMA | FP16, BF16 | No | No | 1D, 2D, 3D | ✅ Active |
+| DeviceGroupedConvBwdWeight_DL | - | Direct Load | FP16, BF16, FP32 | No | No | 1D, 2D, 3D | ✅ Active |
+| DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle | - | XDL/MFMA | FP16, BF16, FP32, TF32, FP8, BF8 | Yes (Bilinear, Scale) | No | 1D, 2D, 3D | ✅ Active |
+| DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3 | v3 | WMMA | FP16, BF16 | Yes (Scale) | No | 3D | ✅ Active |
+
+## 4. Supported Configurations
+
+### Data Types
+- **FP16** (half_t): 16-bit floating point
+- **BF16** (bhalf_t): Brain float 16
+- **FP32** (float): 32-bit floating point
+- **TF32** (tf32_t): TensorFloat-32 (compute type)
+- **INT8** (int8_t): 8-bit integer (WMMA bwd data only)
+- **FP8/BF8**: 8-bit floating point (compute type for newer GPUs)
+
+### Tensor Layouts
+
+**Backward Data:**
+- Input: GNHWC, NHWGC, NGCHW (2D), GNDHWC, NDHWGC (3D), GNWC (1D)
+- Weight: GKYXC, GKCYX (2D), GKZYXC (3D), GKXC (1D)
+- Output: GNHWK, NHWGK, NGKHW (2D), GNDHWK, NDHWGK (3D), GNWK (1D)
+
+**Backward Weight:**
+- Input: GNHWC, NHWGC, NGCHW (2D), GNDHWC, NDHWGC (3D), GNWC (1D)
+- Weight: GKYXC, GKCYX (2D), GKZYXC (3D), GKXC (1D)
+- Output: GNHWK, NHWGK, NGKHW (2D), GNDHWK, NDHWGK (3D), GNWK (1D)
+
+### Spatial Dimensions
+- **1D Convolution**: NDimSpatial = 1
+- **2D Convolution**: NDimSpatial = 2
+- **3D Convolution**: NDimSpatial = 3
+
+### Specializations
+1. **Default**: General purpose convolution
+2. **Filter1x1Stride1Pad0**: Optimized for 1x1 convolution with stride 1 and no padding
+
+## 5. Instance Organization
+
+Instances are organized by:
+- **Spatial dimension** (1D, 2D, 3D)
+- **Data type** (FP16, BF16, FP32, INT8, etc.)
+- **Tensor layout** combination
+- **Specialization** (Default, Filter1x1Stride1Pad0)
+- **Element-wise operations** (PassThrough, Bilinear, Scale)
+
+Each instance specifies detailed template parameters including:
+- Block sizes (BlockSize, MPerBlock, NPerBlock, K0PerBlock)
+- Wave/Warp configurations (MXdlPerWave, NXdlPerWave or MRepeat, NRepeat)
+- Thread cluster arrangements
+- Vector access patterns
+- LDS (Local Data Share) optimizations
+- CShuffle parameters
+
+## 6. Key Template Parameters
+
+### Common Parameters (XDL variants)
+- **BlockSize**: Total threads per block (64, 128, 256)
+- **MPerBlock, NPerBlock**: GEMM tile sizes per block
+- **K0PerBlock**: K dimension blocking
+- **K1**: Vector width in K dimension (typically 8 for FP16, 4 for FP32)
+- **MPerXDL, NPerXDL**: Matrix dimensions per XDL instruction (16x16 or 32x32)
+- **MXdlPerWave, NXdlPerWave**: Number of XDL tiles per wave
+- **Pipeline Version**: v1, v2, v3, v4, v5 (different prefetch strategies)
+- **Pipeline Scheduler**: Intrawave vs Interwave
+
+### Common Parameters (WMMA variants)
+- **BlockSize**: Total threads per block (32, 64, 96, 128, 256)
+- **MPerBlock, NPerBlock**: GEMM tile sizes
+- **K0PerBlock**: K dimension blocking
+- **K1**: Vector width (typically 8 for FP16, 16 for INT8)
+- **MPerWmma, NPerWmma**: WMMA tile size (16x16)
+- **MRepeat, NRepeat**: Repetition factors
+
+### Two-Stage Specific Parameters
+- **NumGroupsToMerge**: Number of groups to merge for better performance
+
+## 7. Performance Considerations
+
+### Choosing the Right Algorithm
+
+**For Backward Data:**
+1. **XDL variant**: Best for modern AMD GPUs with Matrix Core support (MI100, MI200, MI300 series)
+2. **WMMA variant**: For AMD GPUs that provide WMMA instructions instead of Matrix Cores (e.g., RDNA3-class hardware)
+3. **Use FP16/BF16** for best performance on modern hardware
+
+**For Backward Weight:**
+1. **V3 variants (XDL or WMMA)**: Recommended for new code, latest optimizations
+2. **Two-Stage variants**: Best for very large convolutions with memory constraints
+3. **V1 XDL**: Good alternative with broader layout support (including NCHW)
+4. **DL variant**: Specific use cases, no shared memory overhead
+5. **MultipleD variants**: When you need fused operations (Bilinear, Scale)
+
+### Optimization Features
+- **Split-K**: Parallelizes the reduction dimension for better occupancy (V3 XDL only, auto-deduced)
+- **CShuffle**: Optimized cross-lane shuffle for data redistribution
+- **Pipeline Versions**: Different prefetch strategies to hide memory latency
+  - v1: Basic pipeline
+  - v2: Enhanced prefetching with tail number support (1-7)
+  - v3: Further optimizations
+  - v4: Dual LDS buffer support
+  - v5: Advanced prefetching
+- **LDS Padding**: Avoid bank conflicts in shared memory
+- **Two-Stage**: Reduces memory footprint for large problems
+
+## 8. Library Structure
+
+```
+library/
+├── include/ck/library/tensor_operation_instance/gpu/
+│   ├── grouped_conv_bwd_data/
+│   │   ├── device_grouped_conv_bwd_data_xdl_instance.hpp
+│   │   ├── device_grouped_conv_bwd_data_wmma_f16_instance.hpp
+│   │   ├── device_grouped_conv_bwd_data_wmma_i8_instance.hpp
+│   │   └── ... (other variants)
+│   └── grouped_conv_bwd_weight/
+│       ├── device_grouped_conv_bwd_weight_v3_xdl_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_v3_wmma_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_xdl_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_dl_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_xdl_scale_instance.hpp
+│       ├── device_grouped_conv_bwd_weight_wmma_scale_instance.hpp
+│       └── device_grouped_conv_bwd_weight_wmma_bilinear_instance.hpp
+└── src/tensor_operation_instance/gpu/
+    ├── grouped_conv1d_bwd_weight/
+    ├── grouped_conv2d_bwd_data/
+    │   ├── wmma/ (WMMA instances for 2D)
+    │   └── xdl/  (XDL instances for 2D)
+    ├── grouped_conv2d_bwd_weight/
+    │   ├── dl/   (Direct load instances)
+    │   ├── wmma/ (WMMA instances for 2D)
+    │   └── xdl/  (XDL instances for 2D - multiple layout subdirs)
+    ├── grouped_conv3d_bwd_data/
+    │   ├── wmma/ (WMMA instances for 3D)
+    │   └── xdl/  (XDL instances for 3D)
+    ├── grouped_conv3d_bwd_weight/
+    │   ├── dl/                              (Direct load)
+    │   ├── wmma/                            (WMMA)
+    │   └── xdl/                             (XDL)
+    ├── grouped_conv3d_bwd_weight_bilinear/
+    └── grouped_conv3d_bwd_weight_scale/
+```
+
+## Summary
+
+The Composable Kernel library provides a comprehensive set of optimized grouped convolution backward kernels:
+
+### Backward Data Algorithms: 2
+- **XDL variant**: ~200+ instances across all data types and layouts
+- **WMMA variant**: ~30 instances for FP16 and INT8
+
+### Backward Weight Algorithms: 9 (all part of static library)
+1. **DeviceGroupedConvBwdWeight_Xdl_CShuffleV3** - Latest XDL (recommended)
+2. **DeviceGroupedConvBwdWeight_Wmma_CShuffleV3** - Latest WMMA (recommended)
+3. **DeviceGroupedConvBwdWeight_Xdl_CShuffle** - Original XDL (legacy but maintained)
+4. **DeviceGroupedConvBwdWeight_Wmma_CShuffle** - Original WMMA (3D only)
+5. **DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle** - Two-stage XDL
+6. **DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3** - Two-stage WMMA
+7. **DeviceGroupedConvBwdWeight_DL** - Direct load variant
+8. **DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle** - XDL with fused ops
+9. **DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3** - WMMA with fused ops
+
+**Total estimated instances:** 300-400+ across all algorithms, data types, layouts, and specializations
+
+**Key differentiators:**
+- V3 variants: Latest optimizations, recommended for new code
+- Two-stage variants: Better for very large convolutions
+- MultipleD variants: Support fused element-wise operations
+- DL variant: No shared memory overhead
+- Wide range of data types (FP16, BF16, FP32, TF32, INT8, FP8/BF8)
+- Various tensor layout combinations
+- Advanced optimizations (CShuffle, Split-K, Pipeline tuning)
diff --git a/experimental/builder/instances/grouped_conv_bwd_data_instances.json b/experimental/builder/instances/grouped_conv_bwd_data_instances.json
new file mode 100644
index 0000000000..366f6f5bb7
--- /dev/null
+++ b/experimental/builder/instances/grouped_conv_bwd_data_instances.json
@@ -0,0 +1,473 @@
+{
+  "description": "Complete listing of all backward data convolution instances in the Composable Kernel library",
+  "note": "All instances are pre-compiled and part of the static library",
+
+  "algorithms": {
+    "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp",
+      "instruction_set": "XDL/MFMA",
+      "status": "active",
+
+      "instances": {
+        "FP16_generic": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "cshuffle_type": "FP16",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_data_xdl_f16_generic_instances",
+          "instance": {
+            "BlockSize": 64,
+            "MPerBlock": 64,
+            "NPerBlock": 64,
+            "K0PerBlock": 32,
+            "AK1": 8,
+            "BK1": 8,
+            "MPerXDL": 32,
+            "NPerXDL": 32,
+            "MXdlPerWave": 2,
+            "NXdlPerWave": 2,
+            "ABlockTransferThreadClusterLengths_K0_M_K1": [4, 16, 1],
+            "ABlockTransferThreadClusterArrangeOrder": [1, 0, 2],
+            "ABlockTransferSrcAccessOrder": [1, 0, 2],
+            "ABlockTransferSrcVectorDim": 2,
+            "ABlockTransferSrcScalarPerVector": 1,
+            "ABlockTransferDstScalarPerVector_K1": 8,
+            "ABlockLdsAddExtraM": 1,
+            "BBlockTransferThreadClusterLengths_K0_N_K1": [4, 8, 1],
+            "BBlockTransferThreadClusterArrangeOrder": [0, 2, 1],
+            "BBlockTransferSrcAccessOrder": [0, 2, 1],
+            "BBlockTransferSrcVectorDim": 1,
+            "BBlockTransferSrcScalarPerVector": 1,
+            "BBlockTransferDstScalarPerVector_K1": 8,
+            "BBlockLdsAddExtraN": 1,
+            "CShuffleMXdlPerWavePerShuffle": 1,
+            "CShuffleNXdlPerWavePerShuffle": 1,
+            "CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock": [1, 16, 1, 4],
+            "CDEBlockTransferScalarPerVector_NPerBlock": 1,
+            "specialization": "ConvBwdDataDefault or Filter1x1Stride1Pad0"
+          }
+        },
+
+        "FP16_16x16": {
+          "data_type": "FP16",
+          "xdl_size": "16x16",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_data_xdl_f16_16_16_instances",
+          "instances": [
+            {
+              "BlockSize": 64,
+              "MPerBlock": 16,
+              "NPerBlock": 64,
+              "K0PerBlock": 32,
+              "AK1": 8,
+              "BK1": 8,
+              "MPerXDL": 16,
+              "NPerXDL": 16,
+              "MXdlPerWave": 1,
+              "NXdlPerWave": 4,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 4
+            },
+            {
+              "BlockSize": 64,
+              "MPerBlock": 16,
+              "NPerBlock": 64,
+              "K0PerBlock": 32,
+              "MPerXDL": 16,
+              "NPerXDL": 16,
+              "MXdlPerWave": 1,
+              "NXdlPerWave": 4,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            },
+            {
+              "BlockSize": 64,
+              "MPerBlock": 16,
+              "NPerBlock": 64,
+              "K0PerBlock": 32,
+              "MXdlPerWave": 1,
+              "NXdlPerWave": 4,
+              "ABlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 4
+            },
+            {
+              "BlockSize": 64,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 32,
+              "MXdlPerWave": 4,
+              "NXdlPerWave": 1,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 4
+            },
+            {
+              "BlockSize": 64,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 32,
+              "MXdlPerWave": 4,
+              "NXdlPerWave": 1,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            },
+            {
+              "BlockSize": 64,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 32,
+              "MXdlPerWave": 4,
+              "NXdlPerWave": 1,
+              "ABlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 4
+            }
+          ]
+        },
+
+        "FP16_32x32_optimized_loads": {
+          "data_type": "FP16",
+          "xdl_size": "32x32",
+          "variant": "optimized memory loads",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_data_xdl_f16_optimized_loads_instances",
+          "instances": [
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 64,
+              "AK1": 16,
+              "BK1": 16,
+              "MPerXDL": 32,
+              "NPerXDL": 32,
+              "MXdlPerWave": 1,
+              "NXdlPerWave": 1,
+              "ABlockTransferSrcScalarPerVector": 16,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 8
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 32,
+              "AK1": 8,
+              "BK1": 8,
+              "MXdlPerWave": 1,
+              "NXdlPerWave": 1,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 8
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 16,
+              "AK1": 4,
+              "BK1": 4,
+              "MXdlPerWave": 1,
+              "NXdlPerWave": 1,
+              "ABlockTransferSrcScalarPerVector": 4,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 8
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 64,
+              "AK1": 16,
+              "BK1": 16,
+              "ABlockTransferSrcScalarPerVector": 16,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 2,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 2
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 32,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 2,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 2
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 16,
+              "ABlockTransferSrcScalarPerVector": 4,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 2,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 2
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 64,
+              "ABlockTransferSrcScalarPerVector": 16,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 32,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 128,
+              "NPerBlock": 32,
+              "K0PerBlock": 16,
+              "ABlockTransferSrcScalarPerVector": 4,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 64,
+              "ABlockTransferSrcScalarPerVector": 16,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 4
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 32,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 4
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 16,
+              "ABlockTransferSrcScalarPerVector": 4,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 8,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 4
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 64,
+              "ABlockTransferSrcScalarPerVector": 16,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 2,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 2
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 32,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 2,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 2
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 16,
+              "ABlockTransferSrcScalarPerVector": 4,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 2,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 2
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 64,
+              "ABlockTransferSrcScalarPerVector": 16,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 32,
+              "ABlockTransferSrcScalarPerVector": 8,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            },
+            {
+              "BlockSize": 256,
+              "MPerBlock": 64,
+              "NPerBlock": 16,
+              "K0PerBlock": 16,
+              "ABlockTransferSrcScalarPerVector": 4,
+              "ABlockTransferDstScalarPerVector_K1": 4,
+              "BBlockTransferSrcScalarPerVector": 1,
+              "BBlockTransferDstScalarPerVector_K1": 1,
+              "CDEBlockTransferScalarPerVector_NPerBlock": 1
+            }
+          ]
+        },
+
+        "FP16_32x32_standard": {
+          "data_type": "FP16",
+          "xdl_size": "32x32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_data_xdl_f16_instances",
+          "instances": [
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 32, "AK1": 8, "BK1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 1},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 1},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 4, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 4, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}
+          ]
+        }
+      }
+    },
+
+    "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp",
+      "instruction_set": "WMMA",
+      "status": "active",
+
+      "instances": {
+        "FP16": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "cshuffle_type": "FP16",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_f16_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_data_wmma_f16_instances",
+          "all_instances": [
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MPerWmma": 16, "NPerWmma": 16, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 1},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 256, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 32, "MPerBlock": 16, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 32, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 8, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 32, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 8, "K1": 8, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 32, "MPerBlock": 16, "NPerBlock": 32, "K0PerBlock": 8, "K1": 8, "MRepeat": 1, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}
+          ]
+        },
+        
+        "INT8": {
+          "data_type": "INT8",
+          "accumulator": "INT32",
+          "cshuffle_type": "INT8",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_i8_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_data_wmma_i8_instances",
+          "all_instances": [
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 16, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 1},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 4, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 256, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 8, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 8, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 32, "MPerBlock": 16, "NPerBlock": 64, "K0PerBlock": 8, "K1": 16, "MRepeat": 1, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 32, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 16, "MRepeat": 4, "NRepeat": 4, "CDEBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 32, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 8, "K1": 16, "MRepeat": 2, "NRepeat": 2, "CDEBlockTransferScalarPerVector_NPerBlock": 8}
+          ]
+        }
+      }
+    }
+  },
+  
+  "instance_count_summary": {
+    "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1": {
+      "FP16": {
+        "generic": 1,
+        "16x16_variants": 6,
+        "32x32_standard": 13,
+        "32x32_optimized_loads": 18,
+        "32x32_nchw": 17,
+        "total_per_layout_specialization": "~50+"
+      },
+      "BF16": "Similar to FP16 (~50+ instances)",
+      "FP32": "~40+ instances (K1=4 instead of 8)",
+      "FP32_TF32": "~30+ instances with TF32 compute",
+      "FP16_comp_BF8_F8": "~20 instances for newer GPUs",
+      "total_estimated": "~200+"
+    },
+    "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle": {
+      "FP16": 17,
+      "INT8": 13,
+      "total": 30
+    }
+  },
+  
+  "notes": {
+    "instance_variations": "Each instance template can be instantiated for multiple layouts and specializations",
+    "layout_multiplier": "Each instance set is typically instantiated for 2-4 layout combinations",
+    "specialization_multiplier": "Each layout may have 1-2 specializations (Default, Filter1x1Stride1Pad0)",
+    "total_instances": "The actual number of compiled instances is significantly higher when accounting for all layout and specialization combinations"
+  }
+}
diff --git a/experimental/builder/instances/grouped_conv_bwd_weight_instances.json b/experimental/builder/instances/grouped_conv_bwd_weight_instances.json
new file mode 100644
index 0000000000..b4dacc5415
--- /dev/null
+++ b/experimental/builder/instances/grouped_conv_bwd_weight_instances.json
@@ -0,0 +1,648 @@
+{
+  "description": "Complete listing of all backward weight convolution instances in the Composable Kernel library",
+  "note": "All instances are pre-compiled and part of the static library. All 9 algorithms are active.",
+  
+  "algorithms": {
+    "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp",
+      "instruction_set": "XDL/MFMA",
+      "version": "v3",
+      "status": "active_recommended",
+      
+      "instances": {
+        "FP32": {
+          "data_type": "FP32",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances",
+          "instance": {
+            "BlockSize": 64,
+            "MPerBlock": 16,
+            "NPerBlock": 16,
+            "K0PerBlock": 32,
+            "K1": 8,
+            "MPerXDL": 16,
+            "NPerXDL": 16,
+            "MXdlPerWave": 1,
+            "NXdlPerWave": 1,
+            "CBlockTransferScalarPerVector_NWaveNPerXdl": 2
+          }
+        },
+        
+        "FP32_TF32": {
+          "data_type": "FP32",
+          "accumulator": "FP32",
+          "compute_type": "TF32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_tf32_instances",
+          "instance": {
+            "BlockSize": 64,
+            "MPerBlock": 16,
+            "NPerBlock": 16,
+            "K0PerBlock": 32,
+            "K1": 8,
+            "MPerXDL": 16,
+            "NPerXDL": 16,
+            "MXdlPerWave": 1,
+            "NXdlPerWave": 1,
+            "ComputeTypeA": "TF32",
+            "ComputeTypeB": "TF32"
+          }
+        },
+        
+        "FP16": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 4, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 80, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 5, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2, "note": "Irregular NPerBlock"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 112, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 7, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2, "note": "Irregular NPerBlock"}
+          ]
+        },
+        
+        "BF16": {
+          "data_type": "BF16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 80, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 5},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 112, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 4, "NXdlPerWave": 7}
+          ]
+        }
+      }
+    },
+    
+    "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp",
+      "instruction_set": "WMMA",
+      "version": "v3",
+      "status": "active_recommended",
+      
+      "instances": {
+        "FP16": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_v3_wmma_c_shuffle_f16_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerWmma": 16, "NPerWmma": 16, "MRepeat": 2, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 2},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Extra LDS"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 64, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 3, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Irregular MPerBlock=48"},
+            {"BlockSize": 128, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 64, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Irregular MPerBlock=96"},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 128, "K1": 8, "MRepeat": 6, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Irregular MPerBlock=96"}
+          ]
+        },
+        
+        "BF16": {
+          "data_type": "BF16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_v3_wmma_c_shuffle_bf16_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MRepeat": 2, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 2},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "Extra LDS"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 64, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 3, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 64, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 128, "K1": 8, "MRepeat": 6, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 96, "MPerBlock": 96, "NPerBlock": 96, "K0PerBlock": 48, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8}
+          ]
+        }
+      }
+    },
+    
+    "DeviceGroupedConvBwdWeight_Xdl_CShuffle": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp",
+      "instruction_set": "XDL/MFMA",
+      "version": "v1",
+      "status": "active_legacy",
+      
+      "instances": {
+        "FP32_generic": {
+          "data_type": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances",
+          "instance": {
+            "BlockSize": 64,
+            "MPerBlock": 64,
+            "NPerBlock": 64,
+            "K0PerBlock": 4,
+            "K1": 4,
+            "MPerXDL": 32,
+            "NPerXDL": 32,
+            "MXdlPerWave": 2,
+            "NXdlPerWave": 2,
+            "CBlockTransferScalarPerVector_NWaveNPerXdl": 1
+          }
+        },
+        
+        "FP32": {
+          "data_type": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 8, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1}
+          ]
+        },
+        
+        "FP16": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}
+          ]
+        },
+        
+        "BF16": {
+          "data_type": "BF16",
+          "accumulator": "FP32",
+          "note": "Instance pattern similar to FP16, ~15 instances"
+        },
+        
+        "FP32_TF32": {
+          "data_type": "FP32",
+          "compute_type": "TF32",
+          "note": "Instance pattern similar to FP32, ~20 instances with TF32 compute"
+        }
+      }
+    },
+    
+    "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp",
+      "instruction_set": "XDL/MFMA",
+      "version": "two_stage",
+      "status": "active",
+      
+      "instances": {
+        "FP16_NHWGC_generic": {
+          "data_type": "FP16",
+          "layout": "NHWGC/GKYXC/NHWGK",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances",
+          "instance": {
+            "BlockSize": 64,
+            "MPerBlock": 16,
+            "NPerBlock": 16,
+            "K0PerBlock": 32,
+            "K1": 8,
+            "MPerXDL": 16,
+            "NPerXDL": 16,
+            "MXdlPerWave": 1,
+            "NXdlPerWave": 1,
+            "NumGroupsToMerge": 1
+          }
+        },
+        
+        "FP16_NHWGC": {
+          "data_type": "FP16",
+          "layout": "NHWGC/GKYXC/NHWGK",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 16, "NPerBlock": 16, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 2, "NumGroupsToMerge": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "NumGroupsToMerge": 4},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 4, "NumGroupsToMerge": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 2},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "NumGroupsToMerge": 4},
+            {"BlockSize": 64, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 1, "NumGroupsToMerge": 8}
+          ]
+        },
+        
+        "FP16_NHWGC_part2": {
+          "data_type": "FP16",
+          "layout": "NHWGC/GKYXC/NHWGK",
+          "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_part2_instances",
+          "all_instances": [
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 1, "NumGroupsToMerge": 1},
+            {"BlockSize": 64, "MPerBlock": 16, "NPerBlock": 256, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 1, "NXdlPerWave": 16, "NumGroupsToMerge": 8},
+            {"BlockSize": 64, "MPerBlock": 16, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 8, "NumGroupsToMerge": 4}
+          ]
+        },
+        
+        "FP16_NHWGC_irregular": {
+          "data_type": "FP16",
+          "layout": "NHWGC/GKYXC/NHWGK",
+          "variant": "irregular block sizes",
+          "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_irregular_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MPerXDL": 16, "NPerXDL": 16, "MXdlPerWave": 3, "NXdlPerWave": 4, "NumGroupsToMerge": 1, "note": "Irregular MPerBlock=48"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 48, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 3, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=48"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 80, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 5, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=80"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 112, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 7, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=112"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 208, "K0PerBlock": 32, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 13, "NumGroupsToMerge": 1, "note": "Irregular NPerBlock=208"}
+          ]
+        },
+        
+        "BF16_NHWGC": {
+          "data_type": "BF16",
+          "layout": "NHWGC/GKYXC/NHWGK",
+          "note": "Instance patterns similar to FP16, with multiple pipeline versions (v1, v2, v5)"
+        }
+      }
+    },
+    
+    "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp",
+      "instruction_set": "WMMA",
+      "version": "two_stage_v3",
+      "status": "active",
+      
+      "instances": {
+        "FP16": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_f16_instances",
+          "instance": {
+            "BlockSize": 32,
+            "MPerBlock": 16,
+            "NPerBlock": 16,
+            "K0PerBlock": 32,
+            "K1": 8,
+            "MPerWmma": 16,
+            "NPerWmma": 16,
+            "MRepeat": 1,
+            "NRepeat": 1,
+            "NumGroupsToMerge": 1,
+            "CBlockTransferScalarPerVector_NPerBlock": 1
+          }
+        },
+        
+        "BF16": {
+          "data_type": "BF16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_two_stage_nhwgc_wmma_c_shuffle_bf16_instances",
+          "instance": {
+            "BlockSize": 32,
+            "MPerBlock": 16,
+            "NPerBlock": 16,
+            "K0PerBlock": 32,
+            "K1": 8,
+            "MRepeat": 1,
+            "NRepeat": 1,
+            "NumGroupsToMerge": 1
+          }
+        }
+      }
+    },
+    
+    "DeviceGroupedConvBwdWeight_DL": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp",
+      "instruction_set": "DL (DLOPS dot-product vector instructions, no matrix cores)",
+      "version": "dl",
+      "status": "active",
+      
+      "instances": {
+        "FP32": {
+          "data_type": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_dl_f32_instances",
+          "instance": {
+            "BlockSize": 256,
+            "MPerBlock": 128,
+            "NPerBlock": 128,
+            "K0PerBlock": 16,
+            "K1": 1,
+            "M1PerThread": 4,
+            "N1PerThread": 4,
+            "KPerThread": 1
+          }
+        },
+        
+        "FP16": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_dl_f16_instances",
+          "instance": {
+            "BlockSize": 256,
+            "MPerBlock": 128,
+            "NPerBlock": 128,
+            "K0PerBlock": 16,
+            "K1": 1,
+            "M1PerThread": 4,
+            "N1PerThread": 4,
+            "KPerThread": 1
+          }
+        },
+        
+        "BF16_mixed": {
+          "data_type": "BF16",
+          "weight_type": "FP32",
+          "output_type": "BF16",
+          "accumulator": "FP32",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_dl_bf16_instances",
+          "instance": {
+            "BlockSize": 256,
+            "MPerBlock": 128,
+            "NPerBlock": 128,
+            "K0PerBlock": 16,
+            "K1": 1,
+            "note": "Mixed precision: BF16 input/output, FP32 weights"
+          }
+        }
+      }
+    },
+    
+    "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp",
+      "instruction_set": "XDL/MFMA",
+      "version": "multiple_d",
+      "status": "active",
+      "fused_operations": ["Bilinear", "Scale"],
+      
+      "instances": {
+        "FP32_Bilinear": {
+          "data_type": "FP32",
+          "element_wise_op": "Bilinear",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_bilinear_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MPerXDL": 32, "NPerXDL": 32, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1, "DsTensors": "Tuple<BLayout>"},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 4, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 4}
+          ]
+        },
+        
+        "FP16_Bilinear": {
+          "data_type": "FP16",
+          "element_wise_op": "Bilinear",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_bilinear_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 1, "DsTensors": "Tuple<BLayout>"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 256, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 4, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 4, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 256, "MPerBlock": 64, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 128, "MPerBlock": 32, "NPerBlock": 128, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 32, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 2, "NXdlPerWave": 1, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 64, "K0PerBlock": 4, "K1": 8, "MXdlPerWave": 1, "NXdlPerWave": 2, "CBlockTransferScalarPerVector_NWaveNPerXdl": 8}
+          ]
+        }
+      }
+    },
+    
+    "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": {
+      "implementation_file": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp",
+      "instruction_set": "WMMA",
+      "version": "multiple_d_v3",
+      "status": "active",
+      "fused_operations": ["Scale"],
+      
+      "instances": {
+        "FP16_Scale": {
+          "data_type": "FP16",
+          "accumulator": "FP32",
+          "element_wise_op": "Scale",
+          "header_file": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp",
+          "template_alias": "device_grouped_conv_bwd_weight_wmma_c_shuffle_f16_scale_instances",
+          "all_instances": [
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 32, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 2},
+            {"BlockSize": 64, "MPerBlock": 32, "NPerBlock": 32, "K0PerBlock": 32, "K1": 8, "MRepeat": 2, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 2},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 128, "NPerBlock": 128, "K0PerBlock": 32, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8, "note": "With LDS extra"},
+            {"BlockSize": 64, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 64, "K1": 8, "MRepeat": 4, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 128, "NPerBlock": 256, "K0PerBlock": 64, "K1": 8, "MRepeat": 8, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 48, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 3, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 64, "K1": 8, "MRepeat": 6, "NRepeat": 2, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 128, "MPerBlock": 64, "NPerBlock": 64, "K0PerBlock": 128, "K1": 8, "MRepeat": 4, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8},
+            {"BlockSize": 256, "MPerBlock": 96, "NPerBlock": 128, "K0PerBlock": 128, "K1": 8, "MRepeat": 6, "NRepeat": 1, "CBlockTransferScalarPerVector_NPerBlock": 8}
+          ]
+        },
+        
+        "BF16_Scale": {
+          "data_type": "BF16",
+          "weight_type": "FP32",
+          "output_type": "BF16",
+          "accumulator": "FP32",
+          "element_wise_op": "Scale",
+          "note": "Instance pattern similar to FP16; ~13 instances with mixed precision"
+        }
+      }
+    }
+  },
+  
+  "instance_count_summary": {
+    "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": {
+      "FP16": 7,
+      "BF16": 7,
+      "FP32": 1,
+      "FP32_TF32": 1,
+      "total": "~16 base instances, multiply by layouts and pipeline variants"
+    },
+    "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": {
+      "FP16": 9,
+      "BF16": 10,
+      "total": 19
+    },
+    "DeviceGroupedConvBwdWeight_Xdl_CShuffle": {
+      "FP32": 17,
+      "FP32_TF32": 20,
+      "FP16": 16,
+      "BF16": 15,
+      "BF16_F32_BF16_mixed": 14,
+      "FP16_comp_BF8_F8": 15,
+      "total": "~100+"
+    },
+    "DeviceGroupedConvBwdWeight_Wmma_CShuffle": {
+      "note": "3D only, limited instances"
+    },
+    "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": {
+      "FP16_NHWGC": 15,
+      "FP16_NHWGC_irregular": 5,
+      "FP16_NGCHW": 15,
+      "BF16_NHWGC": 15,
+      "BF16_NGCHW": 15,
+      "total": "~65+ (across pipev1/v2/v5 and layouts)"
+    },
+    "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": {
+      "FP16": 1,
+      "BF16": 1,
+      "total": 2
+    },
+    "DeviceGroupedConvBwdWeight_DL": {
+      "FP32": 1,
+      "FP16": 1,
+      "BF16_mixed": 1,
+      "total": "3 base (multiply by layouts: ~6-9)"
+    },
+    "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": {
+      "FP32_Bilinear": 16,
+      "FP32_TF32_Bilinear": 14,
+      "FP16_Bilinear": 17,
+      "BF16_Bilinear": 15,
+      "FP16_comp_BF8_F8_Bilinear": 15,
+      "note": "Similar counts for Scale variants",
+      "total": "~100+"
+    },
+    "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": {
+      "FP16_Scale": 10,
+      "BF16_Scale": 13,
+      "total": 23
+    }
+  },
+  
+  "file_references_by_algorithm": {
+    "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp",
+      "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/gnhwc_gkyxc_gnhwk/device_grouped_conv2d_bwd_weight_v3_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_v3_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev1_instance.cpp"
+      ]
+    },
+    "DeviceGroupedConvBwdWeight_Wmma_CShuffleV3": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp",
+      "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_wmma_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_wmma_nhwgc_gkyxc_nhwgk_bf16_instance.cpp"
+      ]
+    },
+    "DeviceGroupedConvBwdWeight_Xdl_CShuffle": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp",
+      "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/*/device_grouped_conv2d_bwd_weight_xdl_*_*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/*.cpp"
+      ]
+    },
+    "DeviceGroupedConvBwdWeight_TwoStage_Xdl_CShuffle": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp",
+      "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_*_pipev*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkcyx_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkcyx_ngkhw_*_pipev*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/ngchw_gkyxc_ngkhw/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_*_pipev*.cpp"
+      ]
+    },
+    "DeviceGroupedConvBwdWeight_TwoStage_Wmma_CShuffleV3": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp",
+      "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_wmma_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/wmma/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_two_stage_wmma_nhwgc_gkyxc_nhwgk_*_pipev1_instance.cpp"
+      ]
+    },
+    "DeviceGroupedConvBwdWeight_DL": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp",
+      "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/dl/device_grouped_conv1d_bwd_weight_dl_*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/dl/device_grouped_conv2d_bwd_weight_dl_*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/dl/device_grouped_conv3d_bwd_weight_dl_*.cpp"
+      ]
+    },
+    "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp",
+      "instances_headers": [
+        "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp",
+        "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
+      ],
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/*.cpp",
+        "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/*.cpp"
+      ]
+    },
+    "DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3": {
+      "implementation": "include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp",
+      "instances_header": "library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_scale_instance.hpp",
+      "cpp_files": [
+        "library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/wmma/*.cpp"
+      ]
+    }
+  },
+  
+  "notes": {
+    "total_instances": "Estimated 300-400+ backward weight instances across all 9 algorithms",
+    "instance_multipliers": "Each instance template is instantiated for multiple layout combinations and specializations",
+    "pipeline_variants": "Two-stage and V3 algorithms have multiple pipeline versions, multiplying instance count",
+    "layout_support": "V1 XDL supports more layouts (including NCHW) than V3",
+    "irregular_blocks": "Two-stage and V3 variants support non-power-of-2 per-block tile sizes (MPerBlock/NPerBlock values such as 48, 80, 96, 112, 208)",
+    "fused_operations": "MultipleD variants support Bilinear and Scale fused operations with additional D tensors"
+  }
+}