diff --git a/example/ck_tile/20_grouped_convolution/conv_configs.hpp b/example/ck_tile/20_grouped_convolution/conv_configs.hpp index a27c7980be..0799362860 100644 --- a/example/ck_tile/20_grouped_convolution/conv_configs.hpp +++ b/example/ck_tile/20_grouped_convolution/conv_configs.hpp @@ -281,7 +281,9 @@ template <> struct PipelineTypeTraits { template - using GemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using GemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV1; template using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAGmemBGmemCRegV1; }; @@ -290,7 +292,9 @@ template <> struct PipelineTypeTraits { template - using GemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV2; + using GemmPipeline = + ck_tile::GemmPipelineAGmemBGmemCRegV2; template using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAGmemBGmemCRegV2; }; @@ -299,7 +303,9 @@ template <> struct PipelineTypeTraits { template - using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem; + using GemmPipeline = + ck_tile::GemmPipelineAgBgCrMem; template using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem; }; @@ -308,7 +314,9 @@ template <> struct PipelineTypeTraits { template - using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3; + using GemmPipeline = + ck_tile::GemmPipelineAgBgCrCompV3; template using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3; }; diff --git a/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tuning_params.hpp b/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tuning_params.hpp index efef0f8d56..8bc7de633a 100644 --- a/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tuning_params.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tuning_params.hpp @@ -60,21 +60,26 @@ template <> struct TilePipelineType { template - using GemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + using GemmPipeline = + 
ck_tile::GemmPipelineAGmemBGmemCRegV1; }; template <> struct TilePipelineType { template - using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem; + using GemmPipeline = + ck_tile::GemmPipelineAgBgCrMem; }; template <> struct TilePipelineType { template - using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3; + using GemmPipeline = + ck_tile::GemmPipelineAgBgCrCompV3; }; template <> diff --git a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp index b0d5b2f8bb..f383feabf0 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp @@ -4,6 +4,8 @@ #pragma once #include "ck_tile/builder/testing/testing.hpp" +#include "ck_tile/builder/testing/conv/fwd.hpp" +#include "ck_tile/builder/testing/conv/bwd_weight.hpp" #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/ops/gemm.hpp" #include "ck_tile/ops/grouped_convolution.hpp" diff --git a/experimental/grouped_convolution_tile_instances/CMakeLists.txt b/experimental/grouped_convolution_tile_instances/CMakeLists.txt index 9a75fdcff6..a2a4568c5d 100644 --- a/experimental/grouped_convolution_tile_instances/CMakeLists.txt +++ b/experimental/grouped_convolution_tile_instances/CMakeLists.txt @@ -2,19 +2,35 @@ # SPDX-License-Identifier: MIT if(GPU_TARGETS MATCHES "gfx9") - # Generate instances using python script (empty to just generate empty instance list) - if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/grouped_convolution_forward_tile_ndhwgc_fp32.inc) + # Generate instances using python script if instance directories don't exist + set(INSTANCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/instances) + if(NOT EXISTS ${INSTANCES_DIR}/forward OR + NOT EXISTS ${INSTANCES_DIR}/backward_weight OR + NOT EXISTS ${INSTANCES_DIR}/backward_data) find_package(Python3 COMPONENTS Interpreter Development) execute_process( COMMAND ${Python3_EXECUTABLE} 
${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py --mode=tests RESULT_VARIABLE ret + OUTPUT_VARIABLE output + ERROR_VARIABLE error ) + + if(NOT ret EQUAL 0) + message(FATAL_ERROR "Failed to generate instances. Return code: ${ret}\nOutput: ${output}\nError: ${error}") + endif() endif() # Find cpp files and create lib for instances - file(GLOB_RECURSE GROUPED_CONV_FWD_TILE "instances/*.cpp") + file(GLOB_RECURSE GROUPED_CONV_FWD_TILE "instances/forward/*.cpp") add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) target_include_directories(device_grouped_conv_fwd_tile_instances PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/test/utils") target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0) + + file(GLOB_RECURSE GROUPED_CONV_BWD_WEIGHT_TILE "instances/backward_weight/*.cpp") + add_instance_library(device_grouped_conv_bwd_weight_tile_instances ${GROUPED_CONV_BWD_WEIGHT_TILE}) + target_include_directories(device_grouped_conv_bwd_weight_tile_instances PRIVATE + "${PROJECT_SOURCE_DIR}/experimental/builder/test/utils") + + target_compile_options(device_grouped_conv_bwd_weight_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0) endif() diff --git a/experimental/grouped_convolution_tile_instances/README.md b/experimental/grouped_convolution_tile_instances/README.md index 1ba5189695..afae70f1b5 100644 --- a/experimental/grouped_convolution_tile_instances/README.md +++ b/experimental/grouped_convolution_tile_instances/README.md @@ -1,5 +1,555 @@ -# Grouped Convolution Tile Instances Generator -CK Tile Convolution instances implemented via builder and generated via python script. -It is integrated with tests and ckProfiler -This functionality will be refactored and moved under the Tile Engine. -At now to speed up development and provide tests for CK Tile Convolution it has been implemented under experimental directory. 
+# CK Tile Instance Generation and Integration + +## Table of Contents +1. [Overview](#overview) +2. [Architecture](#architecture) +3. [Instance Generation Workflow](#instance-generation-workflow) +4. [Configuration Files](#configuration-files) +5. [Python Generation Script](#python-generation-script) +6. [Generated Artifacts](#generated-artifacts) +7. [Integration with CK Profiler](#integration-with-ck-profiler) +8. [Directory Structure](#directory-structure) +9. [Usage](#usage) + +--- + +## Overview + +The CK Tile instance generation system provides an automated way to create optimized convolution kernel instances using the **CK Builder** pattern. These instances are: + +- **Generated** from configuration files containing instance parameter strings +- **Integrated** with the CK Profiler for benchmarking and validation + +### Key Components + +1. **CK Builder** (`/projects/composablekernel/experimental/builder`) + - High-level C++20 interface for constructing composable kernel operations + - Provides compile-time dispatch from builder descriptors to specialized kernel implementations + +2. **Instance Generator** (`/projects/composablekernel/experimental/grouped_convolution_tile_instances`) + - Python-based code generation system + - Parses configuration files with instance strings + - Generates C++ wrapper files using templates + +3. **CK Profiler Integration** (`projects/composablekernel/profiler`) + - Benchmarks generated instances + - Validates correctness against reference implementations + - Selects best-performing kernels + +--- + +## Architecture + +### CK Builder Design + +The CK Builder uses a **builder pattern** that separates: + +1. **Signature** - Defines the operation (data type, layout, direction) +2. **Algorithm** - Specifies tile parameters and optimizations +3. 
**Instance** - The compiled kernel from Signature + Algorithm + +```cpp +// Example: Building a convolution instance +using Builder = ckb::ConvBuilder; +using Instance = Builder::Instance; + +auto conv = Instance{}; +ckt::RunResult result = ckt::run(conv, args, inputs, outputs, stream_config); +``` + +### Convolution Signatures + +Signatures are compile-time constants that define the operation: + +```cpp +constexpr auto SIGNATURE_NHWGC_FP16_FWD = ckt::ConvSignature{ + .spatial_dim = 2, // 2D convolution + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}} +}; +``` + +### Tile Algorithm Configuration + +Algorithms specify tile sizes, GEMM parameters, and optimizations: + +```cpp +constexpr auto ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{} + .with_tile_specializations(ckb::TileConvSpecialization::DEFAULT) + .with_tile_thread_block(ckt::TileThreadBlock{ + .tile_size = {.m = 128, .n = 128, .k = 32} + }) + .with_tile_block_gemm(ckt::TileBlockGemm{ + .warps = {.m = 2, .n = 2, .k = 1}, + .warp_tile = {.m = 32, .n = 32, .k = 16}, + .double_smem_buffer = false, + .num_wave_groups = 1, + .pipeline_version = ckb::PipelineVersion::V1, + .scheduler = ckb::PipelineScheduler::INTRAWAVE + }) + .with_tile_transfer(ckt::TileTransfer{ + .a_scalar_per_vector = 8, + .b_scalar_per_vector = 8, + .c_scalar_per_vector = 8 + }) + .with_tile_optimizations(ckt::TileOptimizations{ + .num_groups_to_merge = 1, + .split_image = false, + .explicit_gemm = false + }); +``` + +--- + +## Instance Generation Workflow + +### Step 1: Configuration Files + +Instance strings are defined in configuration files organized by: +- **Direction**: `forward`, `backward_weight`, `backward_data` +- **Purpose**: `profiler` (all instances), 
`tests` (limited set), `compilation` (empty) +- **Layout & Data Type**: e.g., `nhwgc_fp16.conf`, `ndhwgc_bf16.conf` + +**Location**: `configs/{direction}/{purpose}/{layout_dtype}.conf` + +### Step 2: Python Generation + +Run `generate_instances.py` to parse configs and generate C++ files: + +```bash +python generate_instances.py \ + --mode profiler \ + --direction all \ + --filter_pattern convolution +``` + +### Step 3: Generated Files + +For each instance, the script generates: + +1. **Individual C++ files** (one per instance) + - Location: `instances/{direction}/{config}/{instance_name}.cpp` + - Contains instance-specific kernel wrapper + +2. **Include files** (`.inc` headers) + - `{problem_name}.inc` - Function declarations + - `{problem_name}_calls.inc` - Function call invocations + +3. **CMake integration** (via `CMakeLists.txt`) + - Compiles all generated instances + - Links with profiler + +### Step 4: Compilation + +CMake compiles the generated instances with: +- GPU-specific optimizations +- Target architecture (e.g., `gfx942`) +- C++20 standard required + +### Step 5: Profiler Integration + +Generated instances are integrated via include files in profiler headers. + +--- + +## Configuration Files + +### Instance String Format + +Configuration files contain instance strings that define kernel parameters. The format varies by device operation type. + +#### Forward Convolution Example + +``` +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +``` + +**Parameters** (parsed by `parse_fwd_instances`): +1. `block_size` = 64 (total threads per block) +2. `m_per_block` = 64 (M dimension of tile) +3. `n_per_block` = 64 (N dimension of tile) +4. `k_per_block` = 32 (K dimension of tile) +5. `spec` = Default (specialization: Default, Filter1x1Pad0, Filter1x1Stride1Pad0, OddC, Filter3x3) +6. `m_per_xdl` = 32 (M dimension per XDL instruction) +7. `n_per_xdl` = 32 (N dimension per XDL instruction) +8. 
`m_xdl_per_wave` = 2 (XDL tiles in M per wave) +9. `n_xdl_per_wave` = 2 (XDL tiles in N per wave) +10. `a_scalar_per_vector` = 1 (vectorization for input) +11. `b_scalar_per_vector` = 1 (vectorization for weight) +12. `c_scalar_per_vector` = 1 (vectorization for output) +13-14. Optional pipeline parameters +15. Optional `num_groups_to_merge` + +#### Backward Weight Convolution Example (V3 Instance) + +``` +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +``` + +**Additional Parameters** (V3 instances): +- `BlkGemmPipelineScheduler` - Intrawave or Interwave +- `BlkGemmPipelineVersion` - v1, v2, v3, v4, or v5 + +### Specializations + +- **DEFAULT** - General purpose convolution +- **FILTER_1X1_PAD0** - Optimized for 1x1 filters with no padding +- **FILTER_1X1_STRIDE1_PAD0** - Optimized for 1x1 filters, stride 1, no padding +- **FILTER_3x3** - Optimized for 3x3 filters +- **OddC** - Optimized for odd channel counts + +### Pipeline Versions + +- **v1** - Basic pipeline +- **v2** - Enhanced pipeline with better scheduling +- **v3** - Advanced pipeline optimizations +- **v4** - Double shared memory buffering +- **v5** - Two wave groups (2x parallelism) + +--- + +## Python Generation Script + +### Script: `generate_instances.py` + +#### Key Functions + +1. **`parse_fwd_instances(instances, problem_name)`** + - Parses forward convolution instance strings + - Extracts tile sizes, GEMM parameters, specializations + - Returns list of `ConvInstanceTemplateParams` objects + +2. **`parse_bwd_weight_instances(instances, problem_name)`** + - Parses backward weight convolution instance strings + - Handles V1, V3, and TwoStage variants + - Extracts pipeline scheduler and version parameters + +3. **`parse_bwd_data_instances(instances, problem_name)`** + - Placeholder for backward data parsing (not yet implemented) + +4. 
**`generate_conv_cpp(instances, problem_name, config, direction, signature_name, filter_pattern)`** + - Generates individual C++ wrapper files from template + - One file per instance + +5. **`generate_defs_inc(instances, problem_name, signature, direction, filter_pattern)`** + - Generates function declarations (`.inc` file) + - Used by profiler to call instances + +6. **`generate_calls_inc(instances, problem_name, direction, filter_pattern)`** + - Generates function call statements (`.inc` file) + - Invokes each instance in profiler benchmark loop + +#### Template System + +**Template**: `instances/grouped_convolution_tile.cpp.in` + +**Placeholders**: +- `gen_signature` → Signature constant name +- `gen_instance_name` → Unique instance function name +- `gen_specialization` → Tile specialization enum +- `gen_thread_block` → Thread block configuration +- `gen_block_gemm_desc` → Block GEMM descriptor +- `gen_block_transfer` → Transfer parameters +- `gen_optimizations` → Optimization settings + +**Generated Output**: `instances/{direction}/{config}/{instance_name}.cpp` + +#### Command-Line Arguments + +```bash +python generate_instances.py \ + --mode {compilation|tests|profiler} \ + --direction {forward|backward_weight|backward_data|all} \ + --filter_pattern {pattern} +``` + +**Modes**: +- `compilation` - Empty instance list (compile-time check only) +- `tests` - Limited instances for testing +- `profiler` - All instances for benchmarking + +--- + +## Generated Artifacts + +### Directory Structure + +``` +instances/ +├── forward/ +│ ├── nhwgc_fp16/ +│ │ ├── grouped_convolution_forward_tile_nhwgc_fp16_0.cpp +│ │ ├── grouped_convolution_forward_tile_nhwgc_fp16_1.cpp +│ │ └── ... +│ ├── grouped_convolution_forward_tile_nhwgc_fp16.inc +│ └── grouped_convolution_forward_tile_nhwgc_fp16_calls.inc +├── backward_weight/ +│ ├── nhwgc_bf16/ +│ │ └── ... +│ └── ... 
+├── instance_includes.inc # Shared headers and signatures +└── instance_run.inc # Shared instance execution logic +``` + +### File Types + +1. **Instance Implementation** (`.cpp`) + ```cpp + // grouped_convolution_forward_tile_nhwgc_fp16_0.cpp + #include "../../instance_includes.inc" + namespace ck_tile::builder::profiling { + constexpr auto SIGNATURE = SIGNATURE_NHWGC_FP16_FWD; + std::tuple run_grouped_convolution_forward_tile_nhwgc_fp16_0( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) + { + constexpr auto ALGORITHM = /* ... */; + #include "../../instance_run.inc" + } + } + ``` + +2. **Function Declarations** (`.inc`) + ```cpp + // grouped_convolution_forward_tile_nhwgc_fp16.inc + std::tuple run_grouped_convolution_forward_tile_nhwgc_fp16_0(...); + std::tuple run_grouped_convolution_forward_tile_nhwgc_fp16_1(...); + // ... + ``` + +3. **Function Calls** (`_calls.inc`) + ```cpp + // grouped_convolution_forward_tile_nhwgc_fp16_calls.inc + run_alg(run_grouped_convolution_forward_tile_nhwgc_fp16_0); + run_alg(run_grouped_convolution_forward_tile_nhwgc_fp16_1); + // ... + ``` + +--- + +## Integration with CK Profiler + +### Profiler Header: `grouped_convolution_forward_tile_algs.hpp` + +This file orchestrates the benchmarking of all CK Tile instances. + +#### Key Components + +1. **Include Generated Instances** + ```cpp + #include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp32.inc" + #include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_bf16.inc" + #include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp16.inc" + // ... more includes + ``` + +2. 
**Benchmark Loop** (`run_grouped_conv_forward_tile_algs`) + ```cpp + template + std::tuple<bool, float, std::string> run_grouped_conv_forward_tile_algs( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) + { + float best_avg_time = std::numeric_limits<float>::max(); + std::string best_op_name; + bool valid = true; + + // Generate reference output + auto reference = ckt::alloc_outputs(args); + using ReferenceInstance = /* ... */; + auto ref_conv = ReferenceInstance{}; + auto ref_result = ckt::run(ref_conv, args, inputs, reference.get()); + + // Lambda to run and validate each instance + auto run_alg = [&](auto&& run_alg_func) { + auto [is_supported, avg_time, op_name] = run_alg_func(args, inputs, outputs, s_conf); + if(is_supported) { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = (best_avg_time < avg_time) ? best_op_name : op_name; + + // Validate correctness + valid = ck_tile::check_err(outputs, reference, rtol, atol); + + std::cout << "Perf: " << avg_time << " ms, " << op_name << std::endl; + } + }; + + // Run all instances based on signature + if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_FWD) { + #include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" + } + // ... 
more signature branches + + return std::make_tuple(valid, best_avg_time, best_op_name); + } + ``` + +--- + +## Directory Structure + +``` +projects/composablekernel/ +├── experimental/ +│ ├── builder/ # CK Builder framework +│ │ ├── include/ck_tile/builder/ # Builder API +│ │ │ ├── conv_builder.hpp # Main builder interface +│ │ │ ├── factory/ # Dispatch to kernel implementations +│ │ │ └── reflect/ # Instance traits and reflection +│ │ ├── test/ # Builder tests and utilities +│ │ └── README.md # Builder documentation +│ │ +│ └── grouped_convolution_tile_instances/ # Instance generation system +│ ├── generate_instances.py # Main generation script +│ ├── CMakeLists.txt # Build configuration +│ ├── README.md # Brief overview +│ │ +│ ├── configs/ # Configuration files +│ │ ├── forward/ +│ │ │ ├── profiler/ # All instances for profiling +│ │ │ │ ├── nhwgc_fp16.conf +│ │ │ │ ├── nhwgc_fp32.conf +│ │ │ │ ├── nhwgc_bf16.conf +│ │ │ │ ├── ndhwgc_fp16.conf +│ │ │ │ ├── ndhwgc_fp32.conf +│ │ │ │ └── ndhwgc_bf16.conf +│ │ │ └── tests/ # Limited instances for testing +│ │ ├── backward_weight/ +│ │ │ └── profiler/ +│ │ └── backward_data/ +│ │ └── profiler/ +│ │ +│ └── instances/ # Generated C++ files +│ ├── instance_includes.inc # Shared headers and signatures +│ ├── instance_run.inc # Shared execution logic +│ ├── grouped_convolution_tile.cpp.in # Template file +│ │ +│ ├── forward/ # Forward instances +│ │ ├── nhwgc_fp16/ +│ │ │ ├── grouped_convolution_forward_tile_nhwgc_fp16_0.cpp +│ │ │ ├── grouped_convolution_forward_tile_nhwgc_fp16_1.cpp +│ │ │ └── ... +│ │ ├── grouped_convolution_forward_tile_nhwgc_fp16.inc +│ │ ├── grouped_convolution_forward_tile_nhwgc_fp16_calls.inc +│ │ └── ... +│ │ +│ └── backward_weight/ # Backward weight instances +│ └── ... +│ +└── profiler/ + ├── include/profiler/ + │ ├── grouped_convolution_forward_tile_algs.hpp # Profiler integration + │ └── ... 
+ └── src/ + └── profile_grouped_conv_fwd.cpp # Main profiler entry point +``` + +--- + +## Usage + +### 1: Generate All Instances for Profiling + +```bash +cd projects/composablekernel/experimental/grouped_convolution_tile_instances + +# Generate all forward, backward_weight, and backward_data instances +python generate_instances.py --mode profiler --direction all +``` + +**Output**: +- Generates `.cpp` files for all instances +- Creates `.inc` declaration and call files +- Ready to compile with CMake + +#### 1.1: Generate Only Forward Instances for Testing + +```bash +# Generate limited forward instances from test configs +python generate_instances.py --mode tests --direction forward +``` + +#### 1.2: Filter Specific Instances + +```bash +# Only generate instances matching "fp16" +python generate_instances.py \ + --mode profiler \ + --direction forward \ + --filter_pattern fp16 +``` + +### 2: Compile the Generated Instances + +```bash +cd build +cmake -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ + -D CMAKE_BUILD_TYPE=Release \ + -D GPU_TARGETS="gfx942" \ + -D CK_EXPERIMENTAL_BUILDER=ON \ + -D CMAKE_CXX_STANDARD=20 \ + -G Ninja \ + .. + +ninja device_grouped_conv_fwd_tile_instances +ninja device_grouped_conv_bwd_weight_tile_instances +``` + +### 3: Run the Profiler + +```bash +# Profile 2D convolution (NHWGC layout, FP16 data type) +./bin/ckProfiler conv fwd \ + 1 0 2 \ # data_type=FP16, layout=NHWGC, spatial_dim=2 + 1 128 128 64 \ # G=1, N=128, K=128, C=64 + 3 3 \ # filter: 3x3 + 28 28 \ # input: 28x28 + 1 1 \ # stride: 1x1 + 1 1 \ # dilation: 1x1 + 1 1 1 1 \ # padding: 1,1,1,1 + 1 0 1 # verification, initialization, profiling + +# Output shows performance of each instance and selects best kernel +``` + +## Adding a New Instance Configuration + +1. 
**Edit config file**: `configs/forward/profiler/nhwgc_fp16.conf` + + ``` + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 64, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> + ``` + +2. **Regenerate instances**: + ```bash + python generate_instances.py --mode profiler --direction forward --filter_pattern fp16 + ``` + +3. **Rebuild**: + ```bash + ninja device_grouped_conv_fwd_tile_instances + ``` + +4. **Profile**: + ```bash + ./bin/ckProfiler conv fwd ... + ``` + +--- + +## References + +- [CK Builder README](../builder/README.md) +- [CK Builder Design](../builder/include/ck_tile/builder/README.md) +- [CK Builder Factory](../builder/include/ck_tile/builder/factory/README.md) +- [CK Builder Testing](../builder/include/ck_tile/builder/testing/README.md) + +--- diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_bf16.conf new file mode 100644 index 0000000000..4ee0de66d1 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_bf16.conf @@ -0,0 +1,82 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 
64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_fp16.conf new file mode 100644 index 0000000000..4ee0de66d1 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_fp16.conf @@ -0,0 +1,82 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 
32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 
64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_fp32.conf new file mode 100644 index 0000000000..0391d33eb7 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/ndhwgc_fp32.conf @@ -0,0 +1,70 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 
32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, 
Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_bf16.conf new file mode 100644 index 0000000000..4ee0de66d1 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_bf16.conf @@ -0,0 +1,82 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 
2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 
32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 
1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_fp16.conf new file mode 100644 index 0000000000..4ee0de66d1 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_fp16.conf @@ -0,0 +1,82 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 
2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 
32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 
1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_fp32.conf new file mode 100644 index 0000000000..0391d33eb7 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/profiler/nhwgc_fp32.conf @@ -0,0 +1,70 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 4, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 
2, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 1, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 
4, Default, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git 
a/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_bf16.conf new file mode 100644 index 0000000000..c5e1b20cff --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_bf16.conf @@ -0,0 +1,16 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_fp16.conf new file mode 100644 index 0000000000..c5e1b20cff --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_fp16.conf @@ -0,0 +1,16 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_fp32.conf new file mode 100644 index 0000000000..fd3a1bbda8 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/ndhwgc_fp32.conf @@ -0,0 +1,14 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 
32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_bf16.conf new file mode 100644 index 0000000000..c5e1b20cff --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_bf16.conf @@ -0,0 +1,16 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, 
Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_fp16.conf new file mode 100644 index 0000000000..c5e1b20cff --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_fp16.conf @@ -0,0 +1,16 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 
16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_fp32.conf new file mode 100644 index 0000000000..fd3a1bbda8 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_data/tests/nhwgc_fp32.conf @@ -0,0 +1,14 @@ +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1> 
+DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1> +DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_bf16.conf new file mode 100644 index 0000000000..d189d6c896 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_bf16.conf @@ -0,0 +1,233 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 
256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), 
BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, 
BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_fp16.conf new file mode 100644 index 0000000000..4ffe1c24a6 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_fp16.conf @@ -0,0 +1,233 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 
4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, 
BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl 
BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 
4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, 
BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), 
BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), 
BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, 
BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_fp32.conf new file mode 100644 index 0000000000..4f3da2883a --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/ndhwgc_fp32.conf @@ -0,0 +1,44 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_bf16.conf new file mode 100644 index 0000000000..f2c7392641 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_bf16.conf @@ -0,0 +1,240 @@ 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,32,64,8,16,16,1,1,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,32,64,8,32,32,2,1,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(8,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,64,1,4),8,Intrawave,v1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,64,8,32,32,1,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,16,4),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,8),8,Intrawave,v1,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),8,Intrawave,v1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),2,Intrawave,v1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,1,1,Seq(1,32,1,4),4,Intrawave,v1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: 
v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl 
BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, 
VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, 
BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), 
BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_fp16.conf new file mode 100644 index 0000000000..563755f4de --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_fp16.conf @@ -0,0 +1,243 @@ 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,32,64,8,16,16,1,1,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,32,64,8,16,16,1,1,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,2> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,2> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,32,64,8,32,32,2,1,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(8,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,64,1,4),8,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,64,8,32,32,1,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,16,4),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,8),8,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,64,8,32,32,1,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,16,4),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,8),8,Intrawave,v1,fp16,fp16,1,2> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),8,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),2,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,1,1,Seq(1,32,1,4),4,Intrawave,v1,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, 
BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl 
BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 
4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, 
BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), 
BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), 
BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, 
BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, 
BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_fp32.conf new file mode 100644 index 0000000000..e9ac3d1072 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/profiler/nhwgc_fp32.conf @@ -0,0 +1,44 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_bf16.conf new file mode 100644 index 0000000000..a612050d57 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_bf16.conf @@ -0,0 +1,46 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> 
+DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 
4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, 
BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_fp16.conf new file mode 100644 index 0000000000..563b2fbb1f --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_fp16.conf @@ -0,0 +1,46 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> 
+DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, 
K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, 
BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_fp32.conf new file mode 100644 index 0000000000..ad617ad558 --- /dev/null 
+++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/ndhwgc_fp32.conf @@ -0,0 +1,8 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_bf16.conf new file mode 100644 index 0000000000..e6430e3430 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_bf16.conf @@ -0,0 +1,48 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,32,64,8,32,32,2,1,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(8,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,64,1,4),8,Intrawave,v1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, 
BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, 
BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_fp16.conf new file mode 100644 index 0000000000..9aff77c6e5 --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_fp16.conf @@ -0,0 +1,48 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),8,Intrawave,v1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1> 
+DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, 
BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, 
BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), 
BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2> +DeviceGroupedConvBwdWeight_Explicit_Xdl BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, 
BlkGemmPipelinePrefetchStages: 2> diff --git a/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_fp32.conf new file mode 100644 index 0000000000..f26738f3ba --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/backward_weight/tests/nhwgc_fp32.conf @@ -0,0 +1,8 @@ +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1> 
+DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1> +DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1> diff --git a/experimental/grouped_convolution_tile_instances/configs/create_configs.sh b/experimental/grouped_convolution_tile_instances/configs/create_configs.sh new file mode 100755 index 0000000000..b268328d2e --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/configs/create_configs.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# Get flag --update-test-configs-only to skip running the CK profiler and update tests based on the existing profiler configs +UPDATE_TEST_CONFIGS_ONLY=false +for arg in "$@"; do + if [ "$arg" == "--update-test-configs-only" ]; then + UPDATE_TEST_CONFIGS_ONLY=true + fi +done + +if [ "$UPDATE_TEST_CONFIGS_ONLY" = false ]; then + + ProfilerPath="../../../build/bin/ckProfiler" + + # Layout: NHWGC-GKYXC-NHWGK (channels last) + fwd_layout=1 + bwd_weight_layout=2 + bwd_data_layout=1 + + # FWD configs + mkdir -p forward/profiler + + # 2D + dim=2 + $ProfilerPath grouped_conv_fwd 0 $fwd_layout $dim --instances > forward/profiler/nhwgc_fp32.conf + $ProfilerPath grouped_conv_fwd 1 $fwd_layout $dim --instances > forward/profiler/nhwgc_fp16.conf + $ProfilerPath 
grouped_conv_fwd 2 $fwd_layout $dim --instances > forward/profiler/nhwgc_bf16.conf + + # 3D + dim=3 + $ProfilerPath grouped_conv_fwd 2 $fwd_layout $dim --instances > forward/profiler/ndhwgc_bf16.conf + $ProfilerPath grouped_conv_fwd 1 $fwd_layout $dim --instances > forward/profiler/ndhwgc_fp16.conf + $ProfilerPath grouped_conv_fwd 0 $fwd_layout $dim --instances > forward/profiler/ndhwgc_fp32.conf + + # BWD weight configs + mkdir -p backward_weight/profiler + + # 2D + dim=2 + $ProfilerPath grouped_conv_bwd_weight 0 $bwd_weight_layout $dim --instances > backward_weight/profiler/nhwgc_fp32.conf + $ProfilerPath grouped_conv_bwd_weight 1 $bwd_weight_layout $dim --instances > backward_weight/profiler/nhwgc_fp16.conf + $ProfilerPath grouped_conv_bwd_weight 5 $bwd_weight_layout $dim --instances > backward_weight/profiler/nhwgc_bf16.conf + + #3D + dim=3 + $ProfilerPath grouped_conv_bwd_weight 5 $bwd_weight_layout $dim --instances > backward_weight/profiler/ndhwgc_bf16.conf + $ProfilerPath grouped_conv_bwd_weight 1 $bwd_weight_layout $dim --instances > backward_weight/profiler/ndhwgc_fp16.conf + $ProfilerPath grouped_conv_bwd_weight 0 $bwd_weight_layout $dim --instances > backward_weight/profiler/ndhwgc_fp32.conf + + # BWD data configs + mkdir -p backward_data/profiler + + # 2D + dim=2 + $ProfilerPath grouped_conv_bwd_data 0 $bwd_data_layout $dim --instances > backward_data/profiler/nhwgc_fp32.conf + $ProfilerPath grouped_conv_bwd_data 1 $bwd_data_layout $dim --instances > backward_data/profiler/nhwgc_fp16.conf + $ProfilerPath grouped_conv_bwd_data 2 $bwd_data_layout $dim --instances > backward_data/profiler/nhwgc_bf16.conf + + #3D + dim=3 + $ProfilerPath grouped_conv_bwd_data 2 $bwd_data_layout $dim --instances > backward_data/profiler/ndhwgc_bf16.conf + $ProfilerPath grouped_conv_bwd_data 1 $bwd_data_layout $dim --instances > backward_data/profiler/ndhwgc_fp16.conf + $ProfilerPath grouped_conv_bwd_data 0 $bwd_data_layout $dim --instances > 
backward_data/profiler/ndhwgc_fp32.conf + +fi + +mkdir -p forward/tests +mkdir -p backward_weight/tests +mkdir -p backward_data/tests + +# Do not change the existing fwd test configs + +# For BWD weight, generate new test configs by taking 20% of the profiler configs for each data type and layout +for layout in nhwgc ndhwgc; do + for dtype in fp32 fp16 bf16; do + profiler_config="backward_weight/profiler/${layout}_${dtype}.conf" + test_config="backward_weight/tests/${layout}_${dtype}.conf" + awk 'NR % 5 == 0' $profiler_config > $test_config # 20% of lines in the profiler configs + done +done + +# For BWD data, generate new test configs by taking 20% of the profiler configs for each data type and layout +for layout in nhwgc ndhwgc; do + for dtype in fp32 fp16 bf16; do + profiler_config="backward_data/profiler/${layout}_${dtype}.conf" + test_config="backward_data/tests/${layout}_${dtype}.conf" + awk 'NR % 5 == 0' $profiler_config > $test_config # 20% of lines in the profiler configs + done +done \ No newline at end of file diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/profiler/ndhwgc_bf16.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/profiler/ndhwgc_bf16.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/profiler/ndhwgc_fp16.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/profiler/ndhwgc_fp16.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf 
b/experimental/grouped_convolution_tile_instances/configs/forward/profiler/ndhwgc_fp32.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/profiler/ndhwgc_fp32.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/profiler/nhwgc_bf16.conf similarity index 99% rename from experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/profiler/nhwgc_bf16.conf index 580a0a1941..15787ad3c3 100644 --- a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf +++ b/experimental/grouped_convolution_tile_instances/configs/forward/profiler/nhwgc_bf16.conf @@ -261,4 +261,4 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 64, Filte # DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> # DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 2, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> \ No newline at end of file +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 2, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, 
BlkGemmPipelineVersion: v1> diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/profiler/nhwgc_fp16.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/profiler/nhwgc_fp16.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/forward/profiler/nhwgc_fp32.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/profiler/nhwgc_fp32.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/tests/ndhwgc_bf16.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/tests/ndhwgc_bf16.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/tests/ndhwgc_fp16.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/tests/ndhwgc_fp16.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/forward/tests/ndhwgc_fp32.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/tests/ndhwgc_fp32.conf 
diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/tests/nhwgc_bf16.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/tests/nhwgc_bf16.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/forward/tests/nhwgc_fp16.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/tests/nhwgc_fp16.conf diff --git a/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/forward/tests/nhwgc_fp32.conf similarity index 100% rename from experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/forward/tests/nhwgc_fp32.conf diff --git a/experimental/grouped_convolution_tile_instances/generate_instances.py b/experimental/grouped_convolution_tile_instances/generate_instances.py old mode 100644 new mode 100755 index 37c2db8a7b..ff48718c89 --- a/experimental/grouped_convolution_tile_instances/generate_instances.py +++ b/experimental/grouped_convolution_tile_instances/generate_instances.py @@ -4,7 +4,6 @@ import argparse from pathlib import Path - class ConvInstanceTemplateParams: def __init__( self, @@ -84,12 +83,37 @@ def get_dtype(problem_name): if problem_name.find("bf16") != -1: return "ck_tile::bf16_t" else: - raise RuntimeError("wrong dtype") + raise RuntimeError("Cannot parse data type from problem name: " + problem_name) +def get_k_mfma(dtype, m_per_xdl, n_per_xdl): + if m_per_xdl != n_per_xdl: + raise RuntimeError("Not supported") + if dtype 
== "float": + if m_per_xdl == 32: + return 2 + else: + return 4 + else: + if m_per_xdl == 32: + return 8 + else: + return 16 + +def check_vectors(a_scalar_per_vector, b_scalar_per_vector, c_scalar_per_vector): + if a_scalar_per_vector != 1 and a_scalar_per_vector % 2 != 0: + return False + if b_scalar_per_vector != 1 and b_scalar_per_vector % 2 != 0: + return False + if c_scalar_per_vector != 1 and c_scalar_per_vector % 2 != 0: + return False + return True def generate_calls_inc(instances, problem_name, direction, filter_pattern): generate_dir = Path(__file__).resolve().parent - with open(f"{generate_dir}/{problem_name}_calls.inc", "w") as f: + output_dir = Path(f"{generate_dir}/instances/{direction}") + output_dir.mkdir(parents=True, exist_ok=True) + + with open(f"{generate_dir}/instances/{direction}/{problem_name}_calls.inc", "w") as f: if problem_name.find(filter_pattern) == -1: return for instance in instances: @@ -99,7 +123,7 @@ def generate_calls_inc(instances, problem_name, direction, filter_pattern): def generate_defs_inc(instances, problem_name, signature, direction, filter_pattern): generate_dir = Path(__file__).resolve().parent - with open(f"{generate_dir}/{problem_name}.inc", "w") as f: + with open(f"{generate_dir}/instances/{direction}/{problem_name}.inc", "w") as f: if problem_name.find(filter_pattern) == -1: return for instance in instances: @@ -113,34 +137,29 @@ def generate_defs_inc(instances, problem_name, signature, direction, filter_patt ) -def generate_fwd_cpp( - instances, problem_name, config, direction, signature_name, filter_pattern -): +def generate_conv_cpp( + instances, problem_name, config, direction, signature_name, filter_pattern): for instance in instances: if problem_name.find(filter_pattern) == -1: break instance_name = problem_name + "_" + str(instance.id) generate_dir = Path(__file__).resolve().parent - directory_path = Path(f"{generate_dir}/instances/{config}") + directory_path = 
Path(f"{generate_dir}/instances/{direction}/{config}") directory_path.mkdir(parents=True, exist_ok=True) - with open( - f"{generate_dir}/instances/grouped_convolution_forward_tile.cpp.in", - "r", - ) as f: + template_file = "grouped_convolution_tile.cpp.in" + + with open(f"{generate_dir}/instances/{template_file}", "r",) as f: content = f.read() - content = content.replace("gen_signature", signature_name) - content = content.replace("gen_instance_name", instance_name) - content = content.replace("gen_specialization", instance.get_specialization()) - content = content.replace("gen_thread_block", instance.get_thread_block()) - content = content.replace("gen_block_gemm_desc", instance.get_block_gemm_desc()) - content = content.replace("gen_block_transfer", instance.get_block_transfer()) - content = content.replace("gen_optimizations", instance.get_optimizations()) + content = content.replace("gen_signature", signature_name) + content = content.replace("gen_instance_name", instance_name) + content = content.replace("gen_specialization", instance.get_specialization()) + content = content.replace("gen_thread_block", instance.get_thread_block()) + content = content.replace("gen_block_gemm_desc", instance.get_block_gemm_desc()) + content = content.replace("gen_block_transfer", instance.get_block_transfer()) + content = content.replace("gen_optimizations", instance.get_optimizations()) - with open( - f"{generate_dir}/instances/{config}/{instance_name}.cpp", - "w", - ) as f: + with open(f"{generate_dir}/instances/{direction}/{config}/{instance_name}.cpp","w",) as f: f.write(content) @@ -172,7 +191,7 @@ def parse_fwd_instances(instances, problem_name): num_groups_to_merge = 1 split_image = instance.find("Large") != -1 double_smem_buffer = instance.find("BlkGemmPipelineVersion: v4") != -1 - num_wave_groups = 2 if instance.find("BlkGemmPipelineVersion: v5") != -1 else 1 + num_wave_groups = 1 scheduler = ( "Intrawave" if instance.find("BlkGemmPipelineScheduler") == -1 else 
args[14] ) @@ -231,6 +250,189 @@ def parse_fwd_instances(instances, problem_name): convs.append(conv) return convs +def parse_instance_string(instance_string): + """Parse instance string, treating Seq(...) as a single parameter.""" + params = [] + current_param = "" + paren_depth = 0 + + for char in instance_string: + if char == '(': + paren_depth += 1 + current_param += char + elif char == ')': + paren_depth -= 1 + current_param += char + elif char == ',' and paren_depth == 0: + # Only split on comma if we're not inside parentheses + params.append(current_param.strip()) + current_param = "" + else: + current_param += char + + # Add the last parameter + if current_param.strip(): + params.append(current_param.strip()) + + return params + +def parse_bwd_weight_instances(instances, problem_name): + convs = [] + + for instance_id, instance in enumerate(instances): + if instance.find("#") != -1 or instance.find(";") != -1: + continue + + device_op_name = instance.split("<")[0] + start = instance.index('<') + 1 + end = instance.rindex('>') + params_str = instance[start:end] + args = parse_instance_string(params_str) + + is_v3_instance = instance.find("Xdl_CShuffleV3") != -1 + is_two_stage_instance = instance.find("TwoStage") != -1 + is_explicit_gemm = device_op_name.find("Explicit") != -1 + + if is_explicit_gemm: + gemm_params = device_op_name = instance.split("<")[2].split(">")[1].split(",") + args = [param.split(":")[1].strip() for param in gemm_params] + + spec = "Default" + block_size = int(args[0]) + + mnk_per_block = args[1].split("x") + m_per_block = int(mnk_per_block[0]) + n_per_block = int(mnk_per_block[1]) + k_per_block = int(mnk_per_block[2]) + + wave_tile = args[2].split("x") + m_per_xdl = int(wave_tile[0]) + n_per_xdl = int(wave_tile[1]) + + k1_values = args[3].split("x") + ak1 = int(k1_values[0]) + bk1 = int(k1_values[1]) + k1 = min(ak1, bk1) + + wave_map = args[4].split("x") + m_xdl_per_wave = int(wave_map[0]) + n_xdl_per_wave = int(wave_map[1]) + + 
vector_read = args[5].split("x") + a_scalar_per_vector = int(vector_read[0]) + b_scalar_per_vector = int(vector_read[1]) + c_scalar_per_vector_seq = [int(x) for x in vector_read[2].strip("Seq").strip("(").strip(")").split(",")] + + if len(set(c_scalar_per_vector_seq)) != 1: + raise RuntimeError(f"c_scalar_per_vector must be the same across all waves for instance {instance_id} with device op {device_op_name}. Found values: {c_scalar_per_vector_seq}") + + c_scalar_per_vector = c_scalar_per_vector_seq[0] + + num_groups_to_merge = 1 + + # Block GEMM pipeline parameters + blk_gemm_pipeline_schduler = args[6] + blk_gemm_pipeline_version = args[7] + else: + spec = args[11] + block_size = int(args[12]) + m_per_block = int(args[13]) + n_per_block = int(args[14]) + k1 = int(args[16]) + m_per_xdl = int(args[17]) + n_per_xdl = int(args[18]) + m_xdl_per_wave = int(args[19]) + n_xdl_per_wave = int(args[20]) + a_scalar_per_vector = int(args[25]) + b_scalar_per_vector = int(args[32]) + c_scalar_per_vector = int(args[38]) + + if is_v3_instance or is_two_stage_instance: + k_per_block = int(args[15]) + else: + k0_per_block = int(args[15]) + k_per_block = k0_per_block * k1 + + if is_v3_instance: + if len(args) != 45: + raise RuntimeError(f"Wrong number of parameters in the V3 XDL CShuffle instance string: {instance}") + + num_groups_to_merge = int(args[44]) + + # Block GEMM pipeline parameters + blk_gemm_pipeline_schduler = args[39] + blk_gemm_pipeline_version = args[40] + elif is_two_stage_instance: + print(f"Skipping instance {instance_id} with device op {device_op_name} since it's not supported yet.") + continue + else: + # Regular V1 XDL CShuffle instance + if len(args) != 43: + raise RuntimeError(f"Wrong number of parameters in the XDL CShuffle instance string: {instance}") + + num_groups_to_merge = 1 + + # Block GEMM pipeline parameters + blk_gemm_pipeline_schduler = "Intrawave" + blk_gemm_pipeline_version = "v1" + + # Common part to all solvers. 
+ + # Sanity check for Block GEMM pipeline parameters + # Scheduler must be either Intrawave or Interwave. + # Version must be from v1 to v5 + if blk_gemm_pipeline_schduler not in ["Intrawave", "Interwave"]: + raise RuntimeError(f"Invalid Block GEMM pipeline scheduler: {blk_gemm_pipeline_schduler} in instance: {instance}") + if blk_gemm_pipeline_version not in ["v1", "v2", "v3", "v4", "v5"]: + raise RuntimeError(f"Invalid Block GEMM pipeline version: {blk_gemm_pipeline_version} in instance: {instance}") + + split_image = instance.find("Large") != -1 + double_smem_buffer = blk_gemm_pipeline_version == "v4" + num_wave_groups = 1 + scheduler = blk_gemm_pipeline_schduler + pipeline_version = blk_gemm_pipeline_version.upper() + + # OLd CK pipeline version V5 maps to V6 for CK Tile + if pipeline_version == "V5": + pipeline_version = "V6" + + m_warp = int(m_per_block / (m_per_xdl * m_xdl_per_wave)) + n_warp = int(n_per_block / (n_per_xdl * n_xdl_per_wave)) + warp_size = 64 + k_warp = int(block_size / (warp_size * m_warp * n_warp)) + dtype = get_dtype(problem_name) + + k_per_xdl = max(k1, get_k_mfma(dtype, m_per_xdl, n_per_xdl)) + + if check_vectors(a_scalar_per_vector, b_scalar_per_vector, c_scalar_per_vector) == False: + print(f"Skipping instance {instance_id} with irregular load since it's not supported yet.") + continue + + + conv = ConvInstanceTemplateParams( + spec, + [m_per_block, n_per_block, k_per_block], + [m_warp, n_warp, k_warp], + [m_per_xdl, n_per_xdl, k_per_xdl], + double_smem_buffer, + num_wave_groups, + pipeline_version, + scheduler, + [a_scalar_per_vector, b_scalar_per_vector, c_scalar_per_vector], + num_groups_to_merge, + split_image, + is_explicit_gemm, + instance_id, + ) + convs.append(conv) + + return convs + +def parse_bwd_data_instances(instances, problem_name): + convs = [] + print("Parsing backward data instances is not supported yet, skipping all instances.") + # TODO: Implement parsing logic for backward data instances. 
+ return convs def generate_instances_fwd(instances, problem_name, config, filter_pattern): direction = "forward" @@ -244,10 +446,62 @@ def generate_instances_fwd(instances, problem_name, config, filter_pattern): direction, filter_pattern, ) - generate_fwd_cpp( + generate_conv_cpp( instances, problem_name, config, direction, signature_name, filter_pattern ) +def generate_instances_bwd_weight(instances, problem_name, config, filter_pattern): + direction = "backward_weight" + signature_name = f"SIGNATURE_{config.upper()}_BWD_WEIGHT" + instances = parse_bwd_weight_instances(instances, problem_name) + generate_calls_inc(instances, problem_name, direction, filter_pattern) + generate_defs_inc( + instances, + problem_name, + signature_name, + direction, + filter_pattern, + ) + generate_conv_cpp( + instances, problem_name, config, direction, signature_name, filter_pattern + ) + +def generate_instances_bwd_data(instances, problem_name, config, filter_pattern): + direction = "backward_data" + signature_name = f"SIGNATURE_{config.upper()}_BWD_DATA" + instances = parse_bwd_data_instances(instances, problem_name) + generate_calls_inc(instances, problem_name, direction, filter_pattern) + generate_defs_inc( + instances, + problem_name, + signature_name, + direction, + filter_pattern, + ) + generate_conv_cpp( + instances, problem_name, config, direction, signature_name, filter_pattern + ) + +def process_direction(configs, direction, generate_func, configs_prefix, filter_pattern): + """Helper function to process a single direction.""" + for config in configs: + instances = [] + generate_dir = Path(__file__).resolve().parent + config_path = f"{generate_dir}/configs/{direction}/{configs_prefix}/{config}.conf" + with open(config_path, "r") as file: + instances = file.readlines() + + # Determine problem name based on direction + if direction == "forward": + problem_name = f"grouped_convolution_forward_tile_{config}" + elif direction == "backward_weight": + problem_name = 
f"grouped_convolution_backward_weight_tile_{config}" + elif direction == "backward_data": + problem_name = f"grouped_convolution_backward_data_tile_{config}" + else: + raise RuntimeError(f"Unknown direction: {direction}") + + generate_func(instances, problem_name, config, filter_pattern) if __name__ == "__main__": fwd_configs = [ @@ -259,6 +513,25 @@ if __name__ == "__main__": "ndhwgc_bf16", ] + # FP32 doesn't work for bwd weigth currently + bwd_weight_configs = [ + "nhwgc_fp32", + "nhwgc_fp16", + "nhwgc_bf16", + "ndhwgc_fp32", + "ndhwgc_fp16", + "ndhwgc_bf16", + ] + + bwd_data_configs = [ + "nhwgc_fp32", + "nhwgc_fp16", + "nhwgc_bf16", + "ndhwgc_fp32", + "ndhwgc_fp16", + "ndhwgc_bf16", + ] + parser = argparse.ArgumentParser( description="Generate grouped conv CK Tile instances." ) @@ -275,6 +548,13 @@ if __name__ == "__main__": default="profiler", help="Generator modes. compilation - empty instance list, tests - limited instance list, profiler - generate all instances", ) + parser.add_argument( + "--direction", + choices=["forward", "backward_weight", "backward_data", "all"], + type=str, + default="all", + help="Convolution direction for which to generate instances." 
+ ) args = parser.parse_args() # apply empty filter @@ -288,11 +568,15 @@ if __name__ == "__main__": else: raise RuntimeError("wrong mode") - for config in fwd_configs: - instances = [] - generate_dir = Path(__file__).resolve().parent - config_path = f"{generate_dir}/configs/{configs_prefix}/{config}.conf" - with open(config_path, "r") as file: - instances = file.readlines() - problem_name = f"grouped_convolution_forward_tile_{config}" - generate_instances_fwd(instances, problem_name, config, args.filter_pattern) + match args.direction: + case "forward": + process_direction(fwd_configs, args.direction, generate_instances_fwd, configs_prefix, args.filter_pattern) + case "backward_weight": + process_direction(bwd_weight_configs, args.direction, generate_instances_bwd_weight, configs_prefix, args.filter_pattern) + case "backward_data": + process_direction(bwd_data_configs, args.direction, generate_instances_bwd_data, configs_prefix, args.filter_pattern) + case "all": + process_direction(fwd_configs, "forward", generate_instances_fwd, configs_prefix, args.filter_pattern) + process_direction(bwd_weight_configs, "backward_weight", generate_instances_bwd_weight, configs_prefix, args.filter_pattern) + process_direction(bwd_data_configs, "backward_data", generate_instances_bwd_data, configs_prefix, args.filter_pattern) + diff --git a/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in b/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_tile.cpp.in similarity index 93% rename from experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in rename to experimental/grouped_convolution_tile_instances/instances/grouped_convolution_tile.cpp.in index 7e86576f7b..c05d520b29 100644 --- a/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in +++ 
b/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_tile.cpp.in @@ -1,6 +1,6 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -#include "../instance_includes.inc" +#include "../../instance_includes.inc" namespace ck_tile::builder::profiling { constexpr auto SIGNATURE = gen_signature; std::tuple run_gen_instance_name(const ckt::Args& args, @@ -14,6 +14,6 @@ std::tuple run_gen_instance_name(const ckt::Args list[str]: + """Get the include directories needed for compilation.""" + return [ + str(project_root / "build" / "include"), + str(project_root / "include"), + str(project_root / "library" / "include"), + str(project_root / "experimental" / "builder" / "include"), + str(project_root / "experimental" / "builder" / "test" / "utils"), + str(project_root / "experimental" / "grouped_convolution_tile_instances" / "instances"), + ] + +def compile_single_file(cpp_file: Path, project_root: Path, gpu_target: str, verbose: bool) -> tuple[bool, str]: + """ + Attempt to compile a single .cpp file. + Returns (success, error_message). 
+ """ + include_dirs = get_include_dirs(project_root) + include_flags = [f"-I{d}" for d in include_dirs] + + # Create a temporary directory for output + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "output.o" + + cmd = [ + CXX_COMPILER, + "-c", # Compile only, don't link + f"-std=c++{CXX_STANDARD}", + f"--offload-arch={gpu_target}", + "-D__HIP_PLATFORM_AMD__", + "-D CK_EXPERIMENTAL_BUILDER=ON", + "-O3", + *include_flags, + str(cpp_file), + "-o", str(output_file) + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300 # 5 minute timeout per file + ) + + if result.returncode == 0: + return True, "" + else: + # Extract the key error message + error_output = result.stderr + return False, error_output + + except subprocess.TimeoutExpired: + return False, "TIMEOUT: Compilation took too long" + except Exception as e: + return False, f"EXCEPTION: {str(e)}" + +def extract_key_error(error_output: str) -> str: + """Extract the most relevant error message from compiler output.""" + lines = error_output.split('\n') + for line in lines: + if 'error:' in line: + return line.strip() + # Return first non-empty line if no explicit error found + for line in lines: + if line.strip(): + return line.strip()[:200] # Limit length + return "Unknown error" + +def find_instance_files(instances_dir: Path, direction: str = "backward_weight") -> dict[str, list[Path]]: + """ + Find all instance .cpp files grouped by subdirectory (layout/datatype). 
def parse_args(argv=None):
    """Parse the command-line options of the instance compile checker.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the options from sys.argv[1:].

    Returns:
        argparse.Namespace with the attributes direction, subdir, max_files,
        verbose, output, project_root, instance, parallel_jobs and gpu_target.

    Exits through parser.error() (status 2) when --instance or --max-files
    is negative.
    """
    parser = argparse.ArgumentParser(description="Check which convolution instances compile")
    parser.add_argument("--direction", default="backward_weight",
                        choices=["forward", "backward_weight", "backward_data"],
                        help="Convolution direction to check")
    parser.add_argument("--subdir", default=None,
                        help="Only check specific subdirectory (e.g., 'nhwgc_bf16')")
    parser.add_argument("--max-files", type=int, default=None,
                        help="Maximum number of files to check per subdirectory")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show verbose output including compile commands")
    parser.add_argument("--output", "-o", default=None,
                        help="Output file for the blacklist")
    parser.add_argument("--project-root", default=None,
                        help="Project root directory (auto-detected if not specified)")
    parser.add_argument("--instance", type=int, default=None,
                        help="Only check a single instance by its index in the config file.")
    parser.add_argument(
        "--parallel-jobs",
        "-j",
        type=int,
        default=1,
        help="Number of parallel compilation jobs (default: 1)",
    )
    parser.add_argument("--gpu-target", type=str, default="gfx950",
                        help="GPU target architecture (default: gfx950)")

    args = parser.parse_args(argv)

    # A negative index can never match a "_<index>.cpp" file name. Rejecting it
    # here avoids an empty run that would be reported as a success.
    if args.instance is not None and args.instance < 0:
        parser.error("--instance must be a non-negative integer")
    # A negative cap would be used as a slice bound and silently drop files
    # from the end of every list.
    if args.max_files is not None and args.max_files < 0:
        parser.error("--max-files must be a non-negative integer")

    return args


def _instance_index(filename):
    """Return the number in a trailing "_<index>.cpp", or 0 when there is none."""
    match = re.search(r'_(\d+)\.cpp$', filename)
    return int(match.group(1)) if match else 0


def main(argv=None):
    """Compile the selected instance files and print a pass/fail summary.

    Args:
        argv: Optional list of argument strings forwarded to parse_args().

    Returns:
        1 when at least one file failed to compile, otherwise 0.

    NOTE: the --output option is parsed but not read by this function.
    """
    args = parse_args(argv)

    # Find project root
    if args.project_root:
        project_root = Path(args.project_root)
    else:
        # Assume script is in experimental/grouped_convolution_tile_instances/,
        # i.e. two directories below the project root.
        script_dir = Path(__file__).resolve().parent
        project_root = script_dir.parent.parent

    instances_dir = project_root / "experimental" / "grouped_convolution_tile_instances" / "instances"

    print(f"Project root: {project_root}")
    print(f"Instances directory: {instances_dir}")
    print(f"Compiler: {CXX_COMPILER}")
    print(f"GPU Target: {args.gpu_target}")
    print(f"Direction: {args.direction}")
    if args.instance is not None:
        print(f"Checking only instance index: {args.instance}")
    print()

    # Find all instance files
    files_by_subdir = find_instance_files(instances_dir, args.direction)

    # If a sub directory is given, check only that sub directory
    if args.subdir:
        if args.subdir not in files_by_subdir:
            print(f"Error: Subdirectory '{args.subdir}' not found")
            print(f"Available: {list(files_by_subdir.keys())}")
            sys.exit(1)
        files_by_subdir = {args.subdir: files_by_subdir[args.subdir]}

    if args.instance is not None:
        # Keep, per sub directory, only the file that carries the requested index
        target_suffix = f"_{args.instance}.cpp"
        instance_files = {}
        for subdir, files in files_by_subdir.items():
            matched_files = [f for f in files if f.name.endswith(target_suffix)]
            if matched_files:
                assert len(matched_files) == 1, f"Expected exactly one file ending with {target_suffix} in {subdir}, found {len(matched_files)}"
                instance_files[subdir] = matched_files
            elif args.subdir is None:
                print(f"Warning: Subdirectory '{subdir}' does not have instance index {args.instance}")
        files_by_subdir = instance_files

        if args.subdir and args.subdir not in files_by_subdir:
            print(f"Instance index {args.instance} was not found in subdirectory '{args.subdir}'")
            sys.exit(1)

    # Apply the per-sub-directory cap once, so that the announced total and the
    # files actually compiled always agree (0 / None mean "no cap").
    max_files = args.max_files or None
    files_to_check_by_subdir = {
        subdir: files[:max_files] for subdir, files in files_by_subdir.items()
    }

    # Track results
    all_failures = defaultdict(list)   # subdir -> list of (filename, error)
    all_successes = defaultdict(list)  # subdir -> list of filenames
    error_types = defaultdict(set)     # error_key -> set of files

    def record_result(subdir_name, filename, success, error):
        """Store the outcome of one compilation in the result tables."""
        if success:
            all_successes[subdir_name].append(filename)
            return
        key_error = extract_key_error(error)
        if args.verbose:
            print(f" Error: {key_error}")
        all_failures[subdir_name].append((filename, key_error))
        error_types[key_error].add(f"{subdir_name}/{filename}")

    total_files = sum(len(files) for files in files_to_check_by_subdir.values())

    print(f"Found {total_files} instance files to check")
    print("=" * 60)

    checked = 0
    for subdir_name, files_to_check in sorted(files_to_check_by_subdir.items()):
        print(f"\nChecking {subdir_name}...", flush=True)

        if args.parallel_jobs > 1:
            # Parallel compilation
            with ThreadPoolExecutor(max_workers=args.parallel_jobs) as executor:
                # Submit all compilation jobs
                futures = {
                    executor.submit(compile_single_file, cpp_file, project_root, args.gpu_target, args.verbose): cpp_file
                    for cpp_file in files_to_check
                }

                # Process results as they complete
                for future in as_completed(futures):
                    filename = futures[future].name
                    checked += 1
                    success, error = future.result()
                    status = "OK" if success else "FAILED"
                    print(f" [{checked}/{total_files}] {filename}... {status}", flush=True)
                    record_result(subdir_name, filename, success, error)
        else:
            # Sequential compilation
            print(f"Compiling {len(files_to_check)} files sequentially...")
            for cpp_file in files_to_check:
                checked += 1
                filename = cpp_file.name

                # Print the progress prefix before compiling so a slow file is visible
                print(f" [{checked}/{total_files}] {filename}...", end=" ", flush=True)

                success, error = compile_single_file(cpp_file, project_root, args.gpu_target, args.verbose)

                print("OK" if success else "FAILED")
                record_result(subdir_name, filename, success, error)

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    for subdir_name in sorted(files_by_subdir.keys()):
        successes = len(all_successes.get(subdir_name, []))
        failures = len(all_failures.get(subdir_name, []))
        total = successes + failures
        print(f"\n{subdir_name}: {successes}/{total} passed, {failures} failed")

        if failures > 0:
            print(" Failed files:")
            # Each filename ends with _{instance_index}.cpp; order by that index
            # so the listing is stable regardless of completion order.
            sorted_failures = sorted(
                all_failures[subdir_name],
                key=lambda failure: _instance_index(failure[0]),
            )
            for filename, _error in sorted_failures:
                print(f" - {filename}")

    # Return exit code based on failures
    total_failures = sum(len(f) for f in all_failures.values())
    if total_failures > 0:
        print(f"\n{total_failures} total failures found")
        return 1

    print("\nAll instances compiled successfully!")
    return 0


if __name__ == "__main__":
    sys.exit(main())
/// @brief Write an informational message to standard output.
///
/// Every argument is streamed, in order, into a single string. That string is
/// then written to std::cout with a "[CK_TILE_INFO] " prefix, followed by a
/// newline and a flush.
///
/// @tparam Args Types of the message parts; each must support operator<< on an
///              output stream.
/// @param args  The parts that make up the message.
template <typename... Args>
void CK_TILE_INFO(Args&&... args) noexcept
{
    std::ostringstream oss;
    // Left fold over operator<<: streams the arguments from first to last.
    (oss << ... << args);
    std::cout << "[CK_TILE_INFO] " << oss.str() << std::endl;
}
64 : 32; - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) + return false; + else if constexpr(std::is_same_v) return false; else if constexpr(kKWarpTile > kMaxKWarpTile) return false; @@ -65,7 +67,9 @@ struct GemmPipelineAgBgCrImplBase using WarpTile = typename BlockGemmShape::WarpTile; constexpr index_t kKWarpTile = WarpTile::at(number<2>{}); constexpr index_t kMaxKWarpTile = (sizeof(BDataType) == 1) ? 64 : 32; - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) + return false; + else if constexpr(std::is_same_v) return false; else if constexpr(kKWarpTile > kMaxKWarpTile) return false; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index 1285cc8cee..92bb0ce0b8 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -49,7 +49,9 @@ struct UniversalGemmBasePolicy constexpr index_t kKWarpTile = WarpTile::at(number<2>{}); // Max K warp tile for transpose load based on data type size constexpr index_t kMaxKWarpTile = (sizeof(ADataType) == 1) ? 64 : 32; - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) + return false; + else if constexpr(std::is_same_v) return false; else if constexpr(kKWarpTile > kMaxKWarpTile) return false; @@ -65,7 +67,9 @@ struct UniversalGemmBasePolicy constexpr index_t kKWarpTile = WarpTile::at(number<2>{}); // Max K warp tile for transpose load based on data type size constexpr index_t kMaxKWarpTile = (sizeof(BDataType) == 1) ? 
64 : 32; - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) + return false; + else if constexpr(std::is_same_v) return false; else if constexpr(kKWarpTile > kMaxKWarpTile) return false; diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp index 43f17c1a56..6c25050c9c 100644 --- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp +++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp @@ -38,6 +38,7 @@ template<> struct Dispatcher { using Typ template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M16N16K8<>; }; template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M32N32K4<>; }; template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M32N32K8<>; }; +template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M32N32K8; }; template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution<>; }; // fp16 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity diff --git a/include/ck_tile/ops/grouped_convolution.hpp b/include/ck_tile/ops/grouped_convolution.hpp index eeb9b1d8a8..3c7b00782f 100644 --- a/include/ck_tile/ops/grouped_convolution.hpp +++ b/include/ck_tile/ops/grouped_convolution.hpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp" #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp" #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp" #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp" diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 143c003784..39c7ba1370 
100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -22,6 +22,15 @@ namespace ck_tile { +template +CK_TILE_HOST void LogInfo(Args&&... args) noexcept +{ + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_INFO(std::forward(args)...); + } +} + /// @brief The Grouped Convolution kernel device arguments. template struct GroupedConvBwdWeightKernelArgs @@ -106,13 +115,18 @@ struct GroupedConvBwdWeightKernelArgs k_batch = args.k_batch; - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - std::cout << "GemmM: " << GemmM << ", GemmN: " << GemmN << ", GemmK: " << GemmK - << ", GemmBatch: " << GemmBatch - << ", NumGroupsPerBatch: " << NumGroupsPerBatch << ", k_batch: " << k_batch - << std::endl; - } + LogInfo("GemmM: ", + GemmM, + ", GemmN: ", + GemmN, + ", GemmK: ", + GemmK, + ", GemmBatch: ", + GemmBatch, + ", NumGroupsPerBatch: ", + NumGroupsPerBatch, + ", k_batch: ", + k_batch); } template < @@ -192,13 +206,18 @@ struct GroupedConvBwdWeightKernelArgs k_batch = args.k_batch; - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - std::cout << "GemmM: " << GemmM << ", GemmN: " << GemmN << ", GemmK: " << GemmK - << ", GemmBatch: " << GemmBatch - << ", NumGroupsPerBatch: " << NumGroupsPerBatch << ", k_batch: " << k_batch - << std::endl; - } + LogInfo("GemmM: ", + GemmM, + ", GemmN: ", + GemmN, + ", GemmK: ", + GemmK, + ", GemmBatch: ", + GemmBatch, + ", NumGroupsPerBatch: ", + NumGroupsPerBatch, + ", k_batch: ", + k_batch); } template < @@ -285,13 +304,18 @@ struct GroupedConvBwdWeightKernelArgs k_batch = args.k_batch; - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - std::cout << "GemmM: " << GemmM << ", GemmN: " << GemmN << ", GemmK: " << GemmK - << ", GemmBatch: " << GemmBatch - << ", NumGroupsPerBatch: " << NumGroupsPerBatch << ", k_batch: " << k_batch - << 
std::endl; - } + LogInfo("GemmM: ", + GemmM, + ", GemmN: ", + GemmN, + ", GemmK: ", + GemmK, + ", GemmBatch: ", + GemmBatch, + ", NumGroupsPerBatch: ", + NumGroupsPerBatch, + ", k_batch: ", + k_batch); } using ABCGridDescs = remove_cvref_t< @@ -474,12 +498,12 @@ struct GroupedConvolutionBackwardWeightKernel CK_TILE_HOST static constexpr GroupedConvBwdWeightKernelArgsSpecialized MakeKernelArgs(const GroupedConvBwdWeightHostArgs& hostArgs) { - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - std::cout << "MPerBlock: " << number{} << std::endl; - std::cout << "NPerBlock: " << number{} << std::endl; - std::cout << "KPerBlock: " << number{} << std::endl; - } + LogInfo("MPerBlock: ", + number{}, + ", NPerBlock: ", + number{}, + ", KPerBlock: ", + number{}); auto kernel_args = GroupedConvBwdWeightKernelArgsSpecialized(hostArgs); @@ -517,11 +541,7 @@ struct GroupedConvolutionBackwardWeightKernel } if(kargs.k_batch < 1) { - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - CK_TILE_ERROR( - "k_batch must be at least one. Ensure argument is created via MakeKernelArgs."); - } + LogInfo("k_batch must be at least one. Ensure argument is created via MakeKernelArgs."); return false; } @@ -533,12 +553,8 @@ struct GroupedConvolutionBackwardWeightKernel // accuracy issues. Hence, we limit the maximum split-K value to 128 in such cases. 
if(kargs.k_batch > 128) { - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - CK_TILE_ERROR( - "For epilogue output data type that is not float/double, we must have " + LogInfo("For epilogue output data type that is not float/double, we must have " "k_batch <= 128."); - } return false; } } @@ -548,20 +564,24 @@ struct GroupedConvolutionBackwardWeightKernel { if(kargs.k_batch != 1) { - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - CK_TILE_ERROR("Conditions not met for K_batch > 1!"); - } + LogInfo("Conditions not met for K_batch > 1: VectorSizeC must be a multiple of 2 " + "for fp16/bf16 when K_batch > 1.", + "Now k_batch is ", + kargs.k_batch, + ", VectorSizeC is ", + GroupedConvTraitsType_::VectorSizeC); return false; } } if(kargs.GemmK < TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch) { - if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) - { - CK_TILE_ERROR("KBatch is too large, part of GPU wouldn't be utilized!"); - } + LogInfo("KBatch is too large, part of GPU wouldn't be utilized! GemmK: ", + kargs.GemmK, + ", BlockGemmShape K: ", + TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}), + ", k_batch: ", + kargs.k_batch); return false; } @@ -581,6 +601,17 @@ struct GroupedConvolutionBackwardWeightKernel if(!(SpatialDim == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0)) { + LogInfo("For Filter1x1Stride1Pad0 specialization, all spatial dimensions must " + "be 1, stride must be 1, and padding must be 0. Now for dimension ", + i, + ": SpatialDim is ", + SpatialDim, + ", ConvStride is ", + ConvStride, + ", LeftPad is ", + LeftPad, + ", RightPad is ", + RightPad); return false; } } @@ -596,6 +627,15 @@ struct GroupedConvolutionBackwardWeightKernel if(!(SpatialDim == 1 && LeftPad == 0 && RightPad == 0)) { + LogInfo("For Filter1x1Pad0 specialization, all spatial dimensions must be 1 " + "and padding must be 0. 
Now for dimension ", + i, + ": SpatialDim is ", + SpatialDim, + ", LeftPad is ", + LeftPad, + ", RightPad is ", + RightPad); return false; } } @@ -604,6 +644,7 @@ struct GroupedConvolutionBackwardWeightKernel { if(ConvC != 1) { + LogInfo("For Filter3x3 specialization, ConvC must be 1. Now ConvC is ", ConvC); return false; } for(index_t i = 0; i < NDimSpatial; ++i) @@ -612,6 +653,11 @@ struct GroupedConvolutionBackwardWeightKernel if(filter_spatial_dim != I3) { + LogInfo("For Filter3x3 specialization, all spatial dimensions of the filter " + "must be 3. Now for dimension ", + i, + ", filter_spatial_dim is ", + filter_spatial_dim); return false; } } @@ -620,8 +666,7 @@ struct GroupedConvolutionBackwardWeightKernel if constexpr(GroupedConvTraitsType_::ExplicitGemm && ConvSpecialization != ConvolutionSpecialization::Filter1x1Stride1Pad0) { - CK_TILE_ERROR( - "Explicit Gemm is supported only for Filter1x1Stride1Pad0 specialization!"); + LogInfo("ExplicitGemm is only supported for Filter1x1Stride1Pad0 specialization."); return false; } @@ -633,14 +678,16 @@ struct GroupedConvolutionBackwardWeightKernel // Check access per C if(ConvC % GroupedConvTraitsType_::VectorSizeB != 0) { - CK_TILE_ERROR("Conv C is not a multiple of vector load size for " - "input image!"); + LogInfo("Conv C is not a multiple of vector load size for input! ConvC: ", + ConvC, + ", VectorSizeB: ", + GroupedConvTraitsType_::VectorSizeB); return false; } } else { - CK_TILE_ERROR("Not supported input layout!"); + LogInfo("Not supported input layout! Now InLayout is ", InLayout::name); return false; } @@ -650,13 +697,16 @@ struct GroupedConvolutionBackwardWeightKernel { if(ConvC % GroupedConvTraitsType_::VectorSizeC != 0) { - CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!"); + LogInfo("Conv C is not a multiple of vector load size for weight! 
ConvC: ", + ConvC, + ", VectorSizeC: ", + GroupedConvTraitsType_::VectorSizeC); return false; } } else { - CK_TILE_ERROR("Not supported weight layout!"); + LogInfo("Not supported weight layout! Now WeiLayout is ", WeiLayout::name); return false; } @@ -666,14 +716,16 @@ struct GroupedConvolutionBackwardWeightKernel { if(ConvK % GroupedConvTraitsType_::VectorSizeA != 0) { - CK_TILE_ERROR("Conv K is not a multiple of vector store size " - "for output image!"); + LogInfo("Conv K is not a multiple of vector load size for output! ConvK: ", + ConvK, + ", VectorSizeA: ", + GroupedConvTraitsType_::VectorSizeA); return false; } } else { - CK_TILE_ERROR("Not supported output layout!"); + LogInfo("Not supported output layout! Now OutLayout is ", OutLayout::name); return false; } @@ -682,7 +734,10 @@ struct GroupedConvolutionBackwardWeightKernel const index_t ConvG = kargs.wei_g_k_c_xs_lengths[number<0>{}]; if(ConvG % GroupedConvTraitsType_::NumGroupsToMerge != 0) { - CK_TILE_ERROR("ConvG must be a multiple of NumGroupsToMerge!"); + LogInfo("Number of groups must be divisible by NumGroupsToMerge! ConvG: ", + ConvG, + ", NumGroupsToMerge: ", + GroupedConvTraitsType_::NumGroupsToMerge); return false; } } diff --git a/include/ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp new file mode 100644 index 0000000000..d989ae03eb --- /dev/null +++ b/include/ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp @@ -0,0 +1,208 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp" +#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" + +namespace ck_tile { + +// UniversalGemm Policy +struct GroupedConvUniversalPipelineAgBgCrPolicy + : public UniversalGemmBasePolicy +{ + + template > + CK_TILE_DEVICE static constexpr auto MakeALdsBlockDescriptor() + { + using ADataType = OverrideADataType; + constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + constexpr index_t KPack = GetSmemPackA(); + + if constexpr(is_a_load_tr) + { + // TODO: better lds descriptor for performance + constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( // + make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{}), + number{}, + number<1>{}); + return a_lds_block_desc_0; + } + else + { + constexpr auto DataTypeSize = sizeof(ADataType); + constexpr uint64_t MinLdsLayer = 1ULL; + constexpr auto MLdsLayer = + max(MinLdsLayer, + get_n_lds_banks() * get_n_dwords_per_128b() / KPerBlock / DataTypeSize); + + constexpr index_t NBanks = get_n_lds_banks(); + static_assert(NBanks == 32 || NBanks == 64, "Unexpected LDS bank count"); + constexpr index_t RowMul = (NBanks == 64) ? 
2 : 1; + + constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple(number{}, + number{}, + number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor( + a_lds_block_desc_0, + make_tuple(make_xor_transform(make_tuple(number{}, + number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor( + a_lds_block_desc_permuted, + make_tuple(make_unmerge_transform( + make_tuple(number{}, number{})), + make_pass_through_transform(number{}), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); + + constexpr auto a_lds_block_desc = transform_tensor_descriptor( + a_lds_block_desc_xk0_mnldslayer_mn_xk1, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(number{}, number{})), + make_merge_transform_v3_division_mod( + make_tuple(number{}, number{}))), + make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + + return a_lds_block_desc; + } + } + + /** + * @brief Create LDS block descriptor for B tensor. + * + * @tparam Problem Gemm pipeline problem. + * @return B tensor LDS block descriptor. 
+ */ + template + CK_TILE_DEVICE static constexpr auto MakeBLdsBlockDescriptor() + { + constexpr bool IsBCastPolicyBeforeLDSWrite = IsBCastPolicyBeforeLDSWrite_v; + using BDataType = std::conditional_t; + + constexpr index_t NPerBlock = Problem::BlockGemmShape::kN; + constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; + + if constexpr(is_b_load_tr) + { + // TODO: better lds descriptor for performance + constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor( // + make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{}), + number{}, + number<1>{}); + return b_lds_block_desc_0; + } + else + { + constexpr index_t KPack = GetSmemPackB(); + constexpr auto BK0 = number{}; + constexpr auto DataTypeSize = sizeof(BDataType); + constexpr uint64_t MinLdsLayer = 1ULL; + constexpr auto NLdsLayer = + max(MinLdsLayer, + get_n_lds_banks() * get_n_dwords_per_128b() / KPerBlock / DataTypeSize); + + constexpr index_t NBanks = get_n_lds_banks(); + static_assert(NBanks == 32 || NBanks == 64, "Unexpected LDS bank count"); + constexpr index_t RowMul = (NBanks == 64) ? 
2 : 1; + + constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor( + make_tuple( + BK0 * number{}, number{}, number{}), + make_tuple(number{}, number{}, number<1>{}), + number{}, + number<1>{}); + + constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor( + b_lds_block_desc_0, + make_tuple(make_xor_transform(make_tuple(number{}, + BK0 * number{})), + make_pass_through_transform(number{})), + make_tuple(sequence<1, 0>{}, sequence<2>{}), + make_tuple(sequence<1, 0>{}, sequence<2>{})); + + constexpr auto b_lds_block_desc_bk0_nldslayer_n_bk1 = transform_tensor_descriptor( + b_lds_block_desc_permuted, + make_tuple(make_unmerge_transform(make_tuple(number{}, BK0)), + make_pass_through_transform(number{}), + make_pass_through_transform(number{})), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}), + make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{})); + + constexpr auto b_lds_block_desc = transform_tensor_descriptor( + b_lds_block_desc_bk0_nldslayer_n_bk1, + make_tuple(make_merge_transform_v3_division_mod( + make_tuple(number{}, number{})), + make_merge_transform_v3_division_mod(make_tuple(BK0, number{}))), + make_tuple(sequence<1, 0>{}, sequence<2, 3>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + return b_lds_block_desc; + } + } + + template + CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm() + { + using BlockWarps = typename Problem::BlockGemmShape::BlockWarps; + using WarpTile = typename Problem::BlockGemmShape::WarpTile; + + constexpr index_t vector_size = + DS_READ_TR_SIZE() / sizeof(typename Problem::ComputeDataType); + constexpr index_t thread_elements = WarpTile::at(I1) * WarpTile::at(I2) / get_warp_size(); + constexpr auto wg_attr_num_access = + !(is_a_load_tr || is_b_load_tr) ? WGAttrNumAccessEnum::Single + : vector_size == thread_elements ? WGAttrNumAccessEnum::Single + : vector_size * 2 == thread_elements ? WGAttrNumAccessEnum::Double + : vector_size * 4 == thread_elements ? 
/// @brief Translate the `split_k` option into the list of split-K values to try.
///
/// @param split_k Either the literal "all" or a decimal integer.
/// @return For "all": {-1, 1, 2, 4, 8, 16, 32, 64, 128}, where -1 stands for
///         "auto deduce value". Otherwise a one-element list holding the
///         parsed integer.
///
/// Prints a diagnostic to std::cerr and terminates the process with
/// EXIT_FAILURE when `split_k` is neither "all" nor a complete integer.
///
/// Declared inline because the definition lives in a header and may therefore
/// be seen by more than one translation unit.
inline std::vector<int> get_split_k_values(const std::string& split_k)
{
    if(split_k == "all")
    {
        return {/*auto deduce value*/ -1, 1, 2, 4, 8, 16, 32, 64, 128};
    }

    try
    {
        std::size_t parsed_chars = 0;
        const int split_k_value  = std::stoi(split_k, &parsed_chars);
        // std::stoi stops at the first non-digit; require the whole string to
        // be consumed so that inputs such as "4abc" are not accepted as 4.
        if(parsed_chars == split_k.size())
        {
            return {split_k_value};
        }
        std::cerr << "Invalid split_k value '" << split_k
                  << "': expected \"all\" or an integer" << '\n';
    }
    catch(const std::exception& e)
    {
        std::cerr << "Invalid split_k value '" << split_k << "': " << e.what() << '\n';
    }
    exit(EXIT_FAILURE);
}
num_accums_split_k); + auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / num_accums_split_k, num_accums / num_accums_split_k); + // Calculate error due to split_k accumulation + auto rtol_split_k = + ck_tile::get_relative_threshold(num_accums_split_k); + auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, num_accums_split_k); + // Use higher threshold + rtol = std::max(rtol, rtol_split_k); + atol = std::max(atol, atol_split_k); + return std::make_tuple(rtol, atol); +} + +/// @brief `run_grouped_conv_backward_weight_tile_algs()` run all grouped conv fwd instances. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// +/// @see run_grouped_conv_backward_weight_tile_algs() +template +std::tuple +run_grouped_conv_backward_weight_tile_algs(const ckt::Args& args, + const std::string& split_k, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + int best_split_k; + bool is_supported; + float avg_time; + bool all_instances_valid = true; + + using DataType = + std::conditional_t>; + + auto reference = ckt::alloc_outputs(args); + using ReferenceInstance = + typename ckb::ConvBuilder::Instance; + auto ref_conv = ReferenceInstance{}; + auto ref_result = ckt::run(ref_conv, args, inputs, reference.get()); + + const auto conv_param = args.to_ck_tile_conv_param(); + + // Get max possible value in the output + const std::size_t weight_bytes_num = conv_param.template GetWeightByte(); + std::vector ref(weight_bytes_num / sizeof(DataType)); + HIP_CHECK_ERROR( + hipMemcpy(&ref.data()[0], reference.get().weight, weight_bytes_num, hipMemcpyDeviceToHost)); + const float max_accumulated_value = *std::max_element(ref.begin(), ref.end()); + const index_t num_accums = std::accumulate(std::begin(conv_param.output_spatial_lengths_), + std::end(conv_param.output_spatial_lengths_), + static_cast(1), + 
std::multiplies()) * + conv_param.N_; + const auto split_k_values = get_split_k_values(split_k); + + auto run_alg = [&](auto&& run_alg_func) { + for(auto& k_batch : split_k_values) + { + std::tie(is_supported, avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(is_supported) + { + ckt::ValidationReport report; + auto&& [rtol, atol] = + get_rtol_atol(num_accums, k_batch, max_accumulated_value); + ckt::Outputs::reflect( + args, + [&](std::string_view name, + const auto& desc, + void* ckt::Outputs::*ptr) { + report.check(name, desc, outputs.*ptr, reference.get().*ptr, rtol, atol); + }); + + const bool valid = report.get_errors().empty(); + if(valid) + { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + best_split_k = best_avg_time < avg_time ? best_split_k : k_batch; + std::cout << "[Valid] Perf: " << std::setw(10) << avg_time << " ms," << " " + << op_name << ", SplitK " << k_batch << std::endl; + } + else + { + std::cout << "[Error] " << op_name << ", SplitK " << k_batch << std::endl; + for(const auto& error : report.get_errors()) + { + std::cout << "\tNumber of incorrect values: " << error.wrong_elements + << " Is all zero:" << error.is_all_zero() + << " max err: " << error.max_error << std::endl; + // Check with cpu verification to get a values + run_cpu_validation(args, outputs, reference.get()); + } + all_instances_valid = false; + } + } + else + { + std::cout << "[Not supported] " << op_name << ", SplitK " << k_batch << std::endl; + } + } + }; + + if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_BWD_WEIGHT) + { +#include "../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_nhwgc_fp16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NHWGC_BF16_BWD_WEIGHT) + { +#include 
"../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_nhwgc_bf16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP32_BWD_WEIGHT) + { +#include "../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_nhwgc_fp32_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP16_BWD_WEIGHT) + { +#include "../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_ndhwgc_fp16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_BF16_BWD_WEIGHT) + { +#include "../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_ndhwgc_bf16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP32_BWD_WEIGHT) + { +#include "../../experimental/grouped_convolution_tile_instances/instances/backward_weight/grouped_convolution_backward_weight_tile_ndhwgc_fp32_calls.inc" + } + else + { + std::cout << "Signature not supported" << std::endl; + return std::make_tuple(false, best_avg_time, best_op_name, best_split_k); + } + return std::make_tuple(all_instances_valid, best_avg_time, best_op_name, best_split_k); +} + +} // namespace ck_tile::builder::profiling diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index 56f8f30785..a9331d6456 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -23,79 +23,12 @@ namespace ck_tile::builder::profiling { namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp32.inc" -#include 
"../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_bf16.inc" -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp16.inc" -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp32.inc" -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_bf16.inc" -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp16.inc" - -template -auto parse_conv_args(int arg_idx, char* const argv[]) -{ - const std::size_t G = static_cast(std::stol(argv[arg_idx++])); - const std::size_t N = static_cast(std::stol(argv[arg_idx++])); - const std::size_t K = static_cast(std::stol(argv[arg_idx++])); - const std::size_t C = static_cast(std::stol(argv[arg_idx++])); - - constexpr auto num_dim_spatial = SIGNATURE.spatial_dim; - - std::vector filter_spatial_lengths(num_dim_spatial); - std::vector input_spatial_lengths(num_dim_spatial); - std::vector conv_filter_strides(num_dim_spatial); - std::vector conv_filter_dilations(num_dim_spatial); - std::vector input_left_pads(num_dim_spatial); - std::vector input_right_pads(num_dim_spatial); - for(int i = 0; i < num_dim_spatial; ++i) - { - filter_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - input_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - conv_filter_strides[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - conv_filter_dilations[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - input_left_pads[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - input_right_pads[i] = static_cast(std::stol(argv[arg_idx++])); - } - - 
ckt::Args args = { - .lengths = - { - .batch_size = N, - .groups = G, - .input_channels = C, - .output_channels = K, - .image = ckt::filter_extent_from_vector(input_spatial_lengths), - .filter = ckt::filter_extent_from_vector(filter_spatial_lengths), - }, - .filter_strides = ckt::filter_extent_from_vector(conv_filter_strides), - .filter_dilation = ckt::filter_extent_from_vector(conv_filter_dilations), - .input_left_pad = ckt::filter_extent_from_vector(input_left_pads), - .input_right_pad = ckt::filter_extent_from_vector(input_right_pads), - .a_elementwise_op = {}, - .b_elementwise_op = {}, - .cde_elementwise_op = {}, - }; - return args; -} +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_nhwgc_fp32.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_nhwgc_bf16.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_nhwgc_fp16.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_ndhwgc_fp32.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_ndhwgc_bf16.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_ndhwgc_fp16.inc" template void run_cpu_validation(const ckt::Args& args, @@ -189,27 +122,27 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_FWD) { -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_BF16_FWD) { -#include 
"../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP32_FWD) { -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP16_FWD) { -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_BF16_FWD) { -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP32_FWD) { -#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/instances/forward/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" } else { diff --git a/profiler/include/profiler/grouped_convolution_signatures.hpp b/profiler/include/profiler/grouped_convolution_signatures.hpp index 0f87e283bb..6917d8588d 100644 --- a/profiler/include/profiler/grouped_convolution_signatures.hpp +++ b/profiler/include/profiler/grouped_convolution_signatures.hpp @@ -67,4 +67,62 @@ constexpr auto SIGNATURE_NDHWGC_FP16_FWD = .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, .output = {.config = 
{.layout = ckb::TensorLayout::NDHWGK}}}; +///////////////////////////////////////// +// BWD WEIGHT signatures +////////////////////////////////////////// + +constexpr auto SIGNATURE_NHWGC_BF16_BWD_WEIGHT = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::BACKWARD_WEIGHT, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NHWGC_FP16_BWD_WEIGHT = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::BACKWARD_WEIGHT, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NHWGC_FP32_BWD_WEIGHT = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::BACKWARD_WEIGHT, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_BF16_BWD_WEIGHT = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::BACKWARD_WEIGHT, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_FP16_BWD_WEIGHT = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::BACKWARD_WEIGHT, + .data_type = ckb::DataType::FP16, + 
.accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_FP32_BWD_WEIGHT = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::BACKWARD_WEIGHT, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + } // namespace ck_tile::builder::profiling diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index c7734b3a4d..aff47e282e 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -25,6 +25,58 @@ namespace ck { namespace profiler { +namespace bwd_data { +template +void print_instances() +{ + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD, + InLayout, + OutDataType, + WeiDataType, + ck::Tuple<>, + InDataType, + OutElementOp, + WeiElementOp, + InElementOp, + ComputeDataType, + ComputeDataType>; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(const auto& op_ptr : op_ptrs) + { +#ifdef CK_EXPERIMENTAL_BUILDER + const auto& instance_str = op_ptr->GetInstanceString(); + if(!instance_str.empty()) + { + std::cout << instance_str << std::endl; + } + else + { + std::cout << op_ptr->GetTypeString() << std::endl; + } +#else + std::cout << op_ptr->GetTypeString() << std::endl; +#endif + } +} +} // namespace bwd_data + template +void print_instances() +{ + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight; + 
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(const auto& op_ptr : op_ptrs) + { +#ifdef CK_EXPERIMENTAL_BUILDER + const auto& instance_str = op_ptr->GetInstanceString(); + if(!instance_str.empty()) + { + std::cout << instance_str << std::endl; + } + else + { + std::cout << op_ptr->GetTypeString() << std::endl; + } +#else + std::cout << op_ptr->GetTypeString() << std::endl; +#endif + } +} +} // namespace bwd_weight + template +void print_instances() +{ + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ComputeTypeA, + ComputeTypeB>; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + for(const auto& op_ptr : op_ptrs) + { + std::cout << op_ptr->GetTypeString() << std::endl; + } +} +} // namespace fwd + template +#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" + +namespace ck_tile::builder::profiling { + +namespace ckt = ck_tile::builder::test; + +template +auto parse_conv_args(int arg_idx, char* const argv[]) +{ + const std::size_t G = static_cast(std::stol(argv[arg_idx++])); + const std::size_t N = static_cast(std::stol(argv[arg_idx++])); + const std::size_t K = static_cast(std::stol(argv[arg_idx++])); + const std::size_t C = static_cast(std::stol(argv[arg_idx++])); + + constexpr auto num_dim_spatial = SIGNATURE.spatial_dim; + + std::vector filter_spatial_lengths(num_dim_spatial); + std::vector input_spatial_lengths(num_dim_spatial); + std::vector conv_filter_strides(num_dim_spatial); + std::vector conv_filter_dilations(num_dim_spatial); + std::vector input_left_pads(num_dim_spatial); + std::vector input_right_pads(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + filter_spatial_lengths[i] = 
static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_strides[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_dilations[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_left_pads[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_right_pads[i] = static_cast(std::stol(argv[arg_idx++])); + } + + ckt::Args args = { + .lengths = + { + .batch_size = N, + .groups = G, + .input_channels = C, + .output_channels = K, + .image = ckt::filter_extent_from_vector(input_spatial_lengths), + .filter = ckt::filter_extent_from_vector(filter_spatial_lengths), + }, + .filter_strides = ckt::filter_extent_from_vector(conv_filter_strides), + .filter_dilation = ckt::filter_extent_from_vector(conv_filter_dilations), + .input_left_pad = ckt::filter_extent_from_vector(input_left_pads), + .input_right_pad = ckt::filter_extent_from_vector(input_right_pads), + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + return args; +} + +} // namespace ck_tile::builder::profiling diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index ccd5b06ee3..2917b79f0b 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -45,6 +45,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") endif() if(CK_EXPERIMENTAL_BUILDER) list(APPEND PROFILER_OPS profile_grouped_conv_fwd_tile.cpp) + list(APPEND PROFILER_OPS profile_grouped_conv_bwd_weight_tile.cpp) endif() endif() @@ -273,6 +274,7 @@ endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") if(CK_EXPERIMENTAL_BUILDER) list(APPEND DEVICE_INSTANCES device_grouped_conv_fwd_tile_instances) + list(APPEND DEVICE_INSTANCES 
device_grouped_conv_bwd_weight_tile_instances) endif() endif() diff --git a/profiler/src/profile_grouped_conv_bwd_data.cpp b/profiler/src/profile_grouped_conv_bwd_data.cpp index c9727d45a5..cc7ce88996 100644 --- a/profiler/src/profile_grouped_conv_bwd_data.cpp +++ b/profiler/src/profile_grouped_conv_bwd_data.cpp @@ -55,10 +55,263 @@ static void print_helper_msg() // clang-format on } +void print_bwd_data_instances(ConvDataType data_type, + ConvLayout layout, + ck::index_t num_dim_spatial) +{ + auto print_available_instances = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeType = decltype(compute_type); + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + ck::profiler::bwd_data::print_instances(); + }; + + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using TF32 = ck::tf32_t; + + using namespace ck::tensor_layout::convolution; + using namespace ck::profiler; + + if(num_dim_spatial == 2) + { + if(layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, BF16{}, BF16{}, BF16{}); + 
} + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, TF32{}); + } + } + else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}); + } + } + else if(layout == ConvLayout::NGCHW_GKYXC_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}); + } + } + else if(layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return 
print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}); + } + } + } + else if(num_dim_spatial == 3) + { + if(layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, TF32{}); + } + } + else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}); + } + } + else if(layout == ConvLayout::NGCHW_GKYXC_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return 
print_available_instances( + I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F32{}, F32{}, F32{}, TF32{}); + } + } + else if(layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, TF32{}); + } + } + } + + std::cout << "[CK_PROFILER] This data_type & layout is not implemented" << std::endl; +} + } // namespace int profile_grouped_conv_bwd_data(int argc, char* argv[]) { + if(argc == 6 && std::string(argv[5]) == "--instances") + { + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const ck::index_t num_dim_spatial = static_cast(std::stoi(argv[4])); + + print_bwd_data_instances(data_type, layout, num_dim_spatial); + return 0; + } // Parse optional named arguments first ck::index_t instance_index = -1; bool list_instances = false; diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp index bc6a1d3323..b15b639c05 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@
-69,10 +69,297 @@ static void print_helper_msg() << std::endl; } +void print_bwd_weight_instances(ConvDataType data_type, + ConvLayout layout, + ck::index_t num_dim_spatial) +{ + + auto print_available_instances = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type_a, + auto compute_type_b) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeTypeA = decltype(compute_type_a); + using ComputeTypeB = decltype(compute_type_b); + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + ck::profiler::bwd_weight::print_instances(); + }; + + constexpr auto I1 = ck::Number<1>{}; + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using F8 = ck::f8_t; + using BF8 = ck::bf8_t; + using TF32 = ck::tf32_t; + + using namespace ck::tensor_layout::convolution; + + if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, 
F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, 
F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::I8_I8_I8) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + 
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::F16_F16_F16_BF8_F8) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{}); + } + else if(data_type == ConvDataType::I8_I8_I8) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKYXC_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NGCDHW{}, GKZYXC{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + if(data_type == 
ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + + std::cout << "[CK_PROFILER] This data_type & layout is not implemented." << std::endl; +} + } // namespace int profile_grouped_conv_bwd_weight(int argc, char* argv[]) { + if(argc == 6 && std::string(argv[5]) == "--instances") + { + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const ck::index_t num_dim_spatial = static_cast(std::stoi(argv[4])); + + print_bwd_weight_instances(data_type, layout, num_dim_spatial); + return 0; + } // Parse optional named arguments first ck::index_t instance_index = -1; bool list_instances = false; diff --git a/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp new file mode 100644 index 0000000000..348ea1023f --- /dev/null +++ b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp @@ -0,0 +1,233 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include +#include +#include +#include + +#include "ck_tile/builder/testing/conv/ck_tile.hpp" +#include "ck_tile/host/device_prop.hpp" +#include "profiler/grouped_convolution_backward_weight_tile_algs.hpp" +#include "profiler/tile_profiler_utils.hpp" + +#include "profiler_operation_registry.hpp" + +namespace { + +enum struct ConvLayout +{ + GNCHW_GKCYX_GNKHW, // 0 + GNHWC_GKYXC_GNHWK, // 1 + NHWGC_GKYXC_NHWGK, // 2 + NGCHW_GKYXC_NGKHW, // 3 + NGCHW_GKCYX_NGKHW, // 4 +}; + +std::ostream& operator<<(std::ostream& os, const ConvLayout& layout) +{ + using ck::operator<<; + switch(layout) + { + case ConvLayout::GNCHW_GKCYX_GNKHW: + os << "Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, N, K, Ho, Wo]"; + break; + case ConvLayout::GNHWC_GKYXC_GNHWK: + os << "Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]"; + break; + case ConvLayout::NHWGC_GKYXC_NHWGK: + os << "Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]"; + break; + case ConvLayout::NGCHW_GKYXC_NGKHW: + os << "Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, G, K, Ho, Wo]"; + break; + case ConvLayout::NGCHW_GKCYX_NGKHW: + os << "Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, G, K, Ho, Wo]"; + break; + default: os << "unknown layout"; + } + + return os; +} + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_FP32_BF16, // 2 + F16_F16_F16_GEMM_BF8, // 3 + INT8_INT8_INT8, // 4 + BF16_BF16_BF16, // 5 + F32_F32_F32_COMP_TF32 // 6 +}; + +std::ostream& operator<<(std::ostream& os, const ConvDataType& data_type) +{ + using ck::operator<<; + switch(data_type) + { + case ConvDataType::F32_F32_F32: os << "Input fp32, Weight fp32, Output fp32"; break; + case ConvDataType::F16_F16_F16: os << "Input fp16, Weight fp16, Output fp16"; break; + case ConvDataType::BF16_FP32_BF16: os << "Input bf16, Weight fp32, Output bf16"; break; + case ConvDataType::F16_F16_F16_GEMM_BF8: + os << "Input fp16, Weight 
fp16, Output fp16, Gemm bf8@fp8"; + break; + case ConvDataType::INT8_INT8_INT8: os << "Input int8, Weight int8, Output int8"; break; + case ConvDataType::BF16_BF16_BF16: os << "Input bf16, Weight bf16, Output bf16"; break; + case ConvDataType::F32_F32_F32_COMP_TF32: + os << "Input fp32, Weight fp32, Output fp32, Compute tf32"; + break; + default: os << "unknown data type"; + } + + return os; +} + +#define OP_NAME "grouped_conv_bwd_weight_tile" +#define OP_DESC "Grouped Convolution Backward Weight (CK Tile)" + +static void print_helper_msg() +{ + std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight fp32, Output bf16\n" + << " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n" + << " 4: Input int8, Weight int8, Output int8\n" + << " 5: Input bf16, Weight bf16, Output bf16\n" + << " 6: Input fp32, Weight fp32, Output fp32, Compute tf32)\n" + << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, " + "N, K, Ho, Wo]\n" + << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, " + "N, Ho, Wo, K]\n" + << " 2: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, " + "Ho, Wo, G, K]\n" + << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, " + "G, K, Ho, Wo]\n" + << " 4: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo]\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() + << " SplitK (-1 for internally computed split-K value, positive value to set k " + "batches explicitly, or 'all' to test all internal split-K values)\n" + << std::endl; +} + +namespace ckb = ck_tile::builder; +namespace ckt = 
ck_tile::builder::test; +namespace ckp = ck_tile::builder::profiling; + +template +int call_profiler(const ckt::Args& args, const std::string& split_k, bool time_kernel) +{ + auto inputs = ckt::alloc_inputs(args); + auto outputs = ckt::alloc_outputs(args); + ckt::init_inputs(args, inputs.get()); + + std::cout << args.make_input_descriptor() << std::endl; + std::cout << args.make_weight_descriptor() << std::endl; + std::cout << args.make_output_descriptor() << std::endl; + auto&& [valid, avg_time, op_name, best_split_k] = + ckp::run_grouped_conv_backward_weight_tile_algs( + args, + split_k, + inputs.get(), + outputs.get(), + ck_tile::stream_config{nullptr, time_kernel}); + if(time_kernel) + { + std::cout << "\nBest configuration parameters:" << "\n\tname: " << op_name + << "\n\tavg_time: " << avg_time << ", SplitK " << best_split_k << std::endl; + } + return !valid; +} + +} // namespace + +int profile_grouped_conv_bwd_weight_tile(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) + { + print_helper_msg(); + return 1; + } + + constexpr ck_tile::index_t conv_params_start_idx = 9; + + std::cout << "IMPORTANT: Generate instances using: python " + "experimental/builder/src/generate_instances.py --mode=profiler and rerun cmake" + << std::endl; + + std::cout << "Data type: " << data_type << std::endl; + std::cout << "Layout: " << layout << std::endl; + const auto params = + ck::utils::conv::parse_conv_param(num_dim_spatial, conv_params_start_idx, argv); + std::cout << params << std::endl; + + const std::string& split_k = std::string(argv[8 + 1 + 
4 + 6 * num_dim_spatial]); + std::cout << "Split-K: " << split_k << std::endl; + + if(layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(num_dim_spatial == 2) + { + if(data_type == ConvDataType::F16_F16_F16) + { + constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP16_BWD_WEIGHT; + return call_profiler( + ckp::parse_conv_args(conv_params_start_idx, argv), + split_k, + time_kernel); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_BF16_BWD_WEIGHT; + return call_profiler( + ckp::parse_conv_args(conv_params_start_idx, argv), + split_k, + time_kernel); + } + } + else if(num_dim_spatial == 3) + { + if(data_type == ConvDataType::F16_F16_F16) + { + constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP16_BWD_WEIGHT; + return call_profiler( + ckp::parse_conv_args(conv_params_start_idx, argv), + split_k, + time_kernel); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_BF16_BWD_WEIGHT; + return call_profiler( + ckp::parse_conv_args(conv_params_start_idx, argv), + split_k, + time_kernel); + } + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_bwd_weight_tile); diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp index 5d21ee2672..301736b4ad 100644 --- a/profiler/src/profile_grouped_conv_fwd.cpp +++ b/profiler/src/profile_grouped_conv_fwd.cpp @@ -73,10 +73,338 @@ static void print_helper_msg() // clang-format on } +void print_fwd_instances(ConvDataType data_type, ConvLayout layout, ck::index_t num_dim_spatial) +{ + + auto print_available_instances = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type, + auto compute_type_a, + auto compute_type_b) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + 
using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + using ComputeTypeA = decltype(compute_type_a); + using ComputeTypeB = decltype(compute_type_b); + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + ck::profiler::fwd::print_instances(); + }; + + constexpr auto I1 = ck::Number<1>{}; + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using F8 = ck::f8_t; + using BF8 = ck::bf8_t; + using TF32 = ck::tf32_t; + using INT8 = int8_t; + + using namespace ck::tensor_layout::convolution; + + // GNHWC_GKYXC_GNHWK + if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == 
ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + // NHWGC_GKYXC_NHWGK + else if(num_dim_spatial == 1 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I1, NWGC{}, GKXC{}, NWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I1, NWGC{}, GKXC{}, NWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); 
+ } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I1, NWGC{}, GKXC{}, NWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return print_available_instances( + I1, NWGC{}, GKXC{}, NWGK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I1, NWGC{}, GKXC{}, NWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + 
else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NGCHW{}, GKYXC{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{}); + } + else if(data_type == ConvDataType::F8_F8_F8) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, F8{}, F8{}, F8{}, F8{}); + } + else if(data_type == ConvDataType::BF8_BF8_F8) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, BF8{}, F8{}, BF8{}, BF8{}); + } + else 
if(data_type == ConvDataType::F8_BF8_F8) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, BF8{}, F8{}, F8{}, BF8{}); + } + else if(data_type == ConvDataType::BF8_F8_F8) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, BF8{}, F8{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + // NGCDHW_GKCZYX_NGKDHW + else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKCYX_NGKHW) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F16{}, F16{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::F32_F32_F32_TF32) + { + return print_available_instances( + I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{}); + } + } + + std::cout << "[CK_PROFILER] This data_type & layout is not implemented" << std::endl; +} + } // namespace int profile_grouped_conv_fwd(int argc, char* argv[]) { + if(argc == 6 && std::string(argv[5]) == "--instances") + { + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const ck::index_t num_dim_spatial = static_cast(std::stoi(argv[4])); + + print_fwd_instances(data_type, layout, num_dim_spatial); + return 0; + } // Parse optional named arguments first ck::index_t instance_index = -1; bool list_instances = false; diff --git a/profiler/src/profile_grouped_conv_fwd_tile.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp index 
2c436abb8f..6980e8ae10 100644 --- a/profiler/src/profile_grouped_conv_fwd_tile.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -9,6 +9,7 @@ #include "ck_tile/builder/testing/conv/ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" +#include "profiler/tile_profiler_utils.hpp" #include "profiler_operation_registry.hpp"