mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-02 04:31:25 +00:00
[rocm-libraries] ROCm/rocm-libraries#4797 (commit 1a30400)
[CK_TILE] Add CK Tile bwd weight profiler ## Motivation To compare old CK and CK Tile, we need to extend the current CK profiler so that it can also run CK Tile instances through the same API. To give CK Tile the same instance coverage as the old CK, I've added code generation from old CK configurations to CK Tile instances using the CK Builder. ## Technical Details - The codegen Python script for CK Tile fwd convs is extended to also support bwd weight and bwd data. - The generated instances are added to the CMake build (target `device_grouped_conv_bwd_weight_tile_instance`s). - A new profiler op (`grouped_conv_bwd_weight_tile`) has been added to the CK Profiler.
This commit is contained in:
committed by
assistant-librarian[bot]
parent
fc1e1a5155
commit
ae4e632c7d
@@ -0,0 +1,82 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,82 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,70 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,82 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,82 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Default, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,70 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Default, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Default, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 32, 64, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 4, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Default, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Default, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,16 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
@@ -0,0 +1,16 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
@@ -0,0 +1,14 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,16 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
@@ -0,0 +1,16 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Default, 32, 32, 1, 1, 16, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 64, 16, 16, Filter1x1Stride1Pad0, 32, 32, 1, 1, 16, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 64, 16, 16, Filter1x1Stride1Pad0, 16, 16, 1, 1, 16, 1, 1, 1>
|
||||
@@ -0,0 +1,14 @@
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 256, 128, 32, 8, 8, Default, 32, 32, 4, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 64, 128, 32, 8, 8, Default, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 32, 128, 32, 8, 8, Default, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<128, 128, 32, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 128, 32, 8, 8, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 16, 64, 32, 8, 8, Default, 16, 16, 1, 4, 4, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Default, 16, 16, 4, 1, 1, 4, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<64, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 1, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Default, 32, 32, 1, 1, 4, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Default, 16, 16, 1, 1, 8, 2, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 32, 16, 4, 4, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 32, 8, 8, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 1, 1>
|
||||
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 64, 16, 16, 4, 4, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 1, 1, 1>
|
||||
@@ -0,0 +1,233 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,233 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,44 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1>
|
||||
@@ -0,0 +1,240 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,32,64,8,16,16,1,1,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,32,64,8,32,32,2,1,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(8,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,64,1,4),8,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,64,8,32,32,1,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,16,4),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,8),8,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),8,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),2,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,1,1,Seq(1,32,1,4),4,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,4,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,243 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,2,4,true,1,1,Seq(1,16,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,2,1,true,1,1,Seq(1,32,1,4),2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,2,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,8,32,32,4,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,8,32,32,2,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,8,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,8,32,32,2,1,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,8,32,32,1,2,Seq(1,4,8,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,8,32,32,2,1,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,8,32,32,1,2,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,32,64,8,16,16,1,1,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,32,64,8,16,16,1,1,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,2>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,2>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,32,64,8,32,32,2,1,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(8,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,64,1,4),8,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,64,8,32,32,1,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,16,4),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,8),8,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,64,8,32,32,1,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,16,4),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,8),8,Intrawave,v1,fp16,fp16,1,2>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),8,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),2,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,2,1,false,1,1,Seq(1,32,1,4),4,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,Seq(8,8,1),Seq(2,0,1),Seq(2,0,1),1,8,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,1,1,Seq(1,16,1,16),4,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,2,8,false,1,1,Seq(1,8,1,32),2,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,256,32,8,16,16,1,16,Seq(4,2,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,48,64,32,8,16,16,3,4,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,208,32,8,16,16,4,13,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,13,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 2x2, WaveMap: 2x2, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 1x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,44 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,1,4,true,1,1,Seq(1,16,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,1,1,true,1,1,Seq(1,32,1,4),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,1,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,4,32,32,4,2,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,128,4,4,32,32,4,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,128,4,4,32,32,2,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,64,4,4,32,32,2,2,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,4,32,32,2,2,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,64,4,4,32,32,2,1,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,128,4,4,32,32,1,2,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,128,32,4,4,32,32,2,1,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,4,32,32,1,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,4,1,true,Seq(1,4,32,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,4,4,32,32,1,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,16,1),Seq(0,3,1,2),Seq(0,3,1,2),2,4,4,true,1,1,Seq(1,16,1,16),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp32,fp32,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,16,16,32,8,16,16,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,1,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp32,fp32,0,1>
|
||||
@@ -0,0 +1,46 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,46 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,8 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<3,NDHWGC,GKZYXC,NDHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
@@ -0,0 +1,48 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,256,32,64,8,32,32,2,1,Seq(4,32,2),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(8,4,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,64,1,4),8,Intrawave,v1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,bf16,bf16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,2,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,128,32,32,8,32,32,4,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,8,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,bf16,bf16,bf16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,48,32,8,16,16,4,3,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,3,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,bf16,bf16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 2x2, WaveMap: 4x4, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,48 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,8,32,32,2,4,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,8,32,32,2,2,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,8,32,32,2,1,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,4,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,16,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,256,128,4,8,32,32,4,2,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,Seq(1,4,16,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,1,1,Seq(1,32,1,8),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,64,128,4,8,32,32,2,2,Seq(1,4,8,4),Seq(0,3,1,2),Seq(0,2,1,3),2,8,2,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,128,32,128,4,8,32,32,1,2,Seq(1,4,4,8),Seq(0,3,1,2),Seq(0,2,1,3),2,8,1,true,Seq(1,4,16,2),Seq(0,3,1,2),Seq(0,2,1,3),2,8,4,true,1,1,Seq(1,32,1,4),8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,128,16,64,64,8,16,16,1,2,Seq(8,2,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(2,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,16,16,2,2,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,Seq(4,8,8),Seq(0,2,1),Seq(0,2,1),1,8,8,false,1,1,Seq(1,32,1,4),8,Intrawave,v1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,32,32,8,32,32,1,1,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,80,32,8,16,16,4,5,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,5,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v2,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),2,Intrawave,v5,fp16,fp16,0,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v1,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,64,8,32,32,1,1,Seq(8,16,1),Seq(2,0,1),Seq(2,0,1),1,4,8,false,Seq(8,32,1),Seq(2,0,1),Seq(2,0,1),1,1,8,false,1,1,Seq(1,4,1,64),1,Intrawave,v1,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,16,128,32,8,16,16,1,8,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,8,false,1,1,Seq(1,4,1,16),1,Intrawave,v1,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,128,32,8,32,32,1,4,Seq(4,4,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,8,8,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,8,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,32,64,32,8,32,32,1,2,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,2,2,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,2,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,32,8,32,32,2,1,Seq(4,16,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,Seq(4,8,1),Seq(2,0,1),Seq(1,0,2),1,4,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,4,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v2,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp16,fp16,fp16,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,112,32,8,16,16,4,7,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,4,4,false,Seq(4,16,1),Seq(2,0,1),Seq(2,0,1),1,7,4,false,1,1,Seq(1,8,1,8),1,Intrawave,v5,1,fp16,fp16,1,1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x256x32, WaveTile: 32x32, K1: 4x4, WaveMap: 4x4, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 128x128x64, WaveTile: 32x32, K1: 4x4, WaveMap: 2x2, VmemReadVec: 8x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 2x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 2x2, WaveMap: 4x1, VmemReadVec: 2x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<Default, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 256x16x64, WaveTile: 16x16, K1: 8x2, WaveMap: 4x1, VmemReadVec: 8x2xSeq(2), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 4x4xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x4, VmemReadVec: 2x2xSeq(4), BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 2x2, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 32x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 3>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 256, BlkTile: 16x256x64, WaveTile: 16x16, K1: 2x4, WaveMap: 1x4, VmemReadVec: 1x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x32x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 64x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 2x1, VmemReadVec: 8x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x128x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x4, VmemReadVec: 2x1xSeq(1), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 64, BlkTile: 16x16x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x1, VmemReadVec: 1x4xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1, BlkGemmPipelinePrefetchStages: 1>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 128x16x64, WaveTile: 16x16, K1: 8x4, WaveMap: 4x1, VmemReadVec: 1x2xSeq(2), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
DeviceGroupedConvBwdWeight_Explicit_Xdl<DeviceBatchedGemmXdlUniversal<MNKPadding, CRR> BlkSize: 128, BlkTile: 16x64x64, WaveTile: 16x16, K1: 4x4, WaveMap: 1x2, VmemReadVec: 1x8xSeq(4), BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2, BlkGemmPipelinePrefetchStages: 2>
|
||||
@@ -0,0 +1,8 @@
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Default,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,128,256,4,4,32,32,2,4,Seq(1,4,32,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,Seq(1,4,64,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,32,1,8),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,64,4,4,32,32,2,2,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,64,64,32,4,4,32,32,2,1,Seq(1,4,16,1),Seq(0,3,1,2),Seq(0,2,1,3),2,4,4,true,Seq(1,4,8,2),Seq(0,3,1,2),Seq(0,2,1,3),2,4,2,true,1,1,Seq(1,16,1,4),4,fp32,fp32,1,1>
|
||||
DeviceGroupedConvBwdWeight_Xdl_CShuffle<2,NHWGC,GKYXC,NHWGK,fp32,fp32,fp32,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,256,64,64,8,8,32,32,1,1,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,Seq(1,8,32,1),Seq(0,3,1,2),Seq(0,3,1,2),2,1,4,true,1,1,Seq(1,4,1,64),1,fp32,fp32,1,1>
|
||||
89
experimental/grouped_convolution_tile_instances/configs/create_configs.sh
Executable file
89
experimental/grouped_convolution_tile_instances/configs/create_configs.sh
Executable file
@@ -0,0 +1,89 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Parse command-line flags.
#   --update-test-configs-only  skip running the CK profiler and only regenerate
#                               the test configs from the existing profiler configs.
# Any other argument is reported on stderr (and otherwise ignored, as before) so
# that a mistyped flag does not silently trigger a full profiler run.
UPDATE_TEST_CONFIGS_ONLY=false
for arg in "$@"; do
    case "$arg" in
        --update-test-configs-only)
            UPDATE_TEST_CONFIGS_ONLY=true
            ;;
        *)
            echo "warning: ignoring unrecognized argument: $arg" >&2
            ;;
    esac
done
|
||||
|
||||
# Regenerate the profiler configs by dumping the instance lists from the CK
# profiler. Skipped entirely when --update-test-configs-only was given.
if [ "$UPDATE_TEST_CONFIGS_ONLY" = false ]; then

    # Profiler binary, relative to the directory this script is run from.
    ProfilerPath="../../../build/bin/ckProfiler"

    # Bail out before any redirection runs: every `>` below truncates its target
    # file first, so a missing profiler would otherwise leave all profiler
    # configs empty.
    if [ ! -x "$ProfilerPath" ]; then
        echo "error: CK profiler not found or not executable: $ProfilerPath" >&2
        exit 1
    fi

    # Layout: NHWGC-GKYXC-NHWGK (channels last)
    fwd_layout=1
    bwd_weight_layout=2
    bwd_data_layout=1

    # In the commands below, the first numeric argument selects the data type;
    # the output file name shows which one each id is used for.

    # FWD configs
    mkdir -p forward/profiler

    # 2D
    dim=2
    "$ProfilerPath" grouped_conv_fwd 0 "$fwd_layout" "$dim" --instances > forward/profiler/nhwgc_fp32.conf
    "$ProfilerPath" grouped_conv_fwd 1 "$fwd_layout" "$dim" --instances > forward/profiler/nhwgc_fp16.conf
    "$ProfilerPath" grouped_conv_fwd 2 "$fwd_layout" "$dim" --instances > forward/profiler/nhwgc_bf16.conf

    # 3D
    dim=3
    "$ProfilerPath" grouped_conv_fwd 2 "$fwd_layout" "$dim" --instances > forward/profiler/ndhwgc_bf16.conf
    "$ProfilerPath" grouped_conv_fwd 1 "$fwd_layout" "$dim" --instances > forward/profiler/ndhwgc_fp16.conf
    "$ProfilerPath" grouped_conv_fwd 0 "$fwd_layout" "$dim" --instances > forward/profiler/ndhwgc_fp32.conf

    # BWD weight configs
    # NOTE: bf16 uses data type id 5 here, unlike fwd / bwd data which use 2.
    mkdir -p backward_weight/profiler

    # 2D
    dim=2
    "$ProfilerPath" grouped_conv_bwd_weight 0 "$bwd_weight_layout" "$dim" --instances > backward_weight/profiler/nhwgc_fp32.conf
    "$ProfilerPath" grouped_conv_bwd_weight 1 "$bwd_weight_layout" "$dim" --instances > backward_weight/profiler/nhwgc_fp16.conf
    "$ProfilerPath" grouped_conv_bwd_weight 5 "$bwd_weight_layout" "$dim" --instances > backward_weight/profiler/nhwgc_bf16.conf

    # 3D
    dim=3
    "$ProfilerPath" grouped_conv_bwd_weight 5 "$bwd_weight_layout" "$dim" --instances > backward_weight/profiler/ndhwgc_bf16.conf
    "$ProfilerPath" grouped_conv_bwd_weight 1 "$bwd_weight_layout" "$dim" --instances > backward_weight/profiler/ndhwgc_fp16.conf
    "$ProfilerPath" grouped_conv_bwd_weight 0 "$bwd_weight_layout" "$dim" --instances > backward_weight/profiler/ndhwgc_fp32.conf

    # BWD data configs
    mkdir -p backward_data/profiler

    # 2D
    dim=2
    "$ProfilerPath" grouped_conv_bwd_data 0 "$bwd_data_layout" "$dim" --instances > backward_data/profiler/nhwgc_fp32.conf
    "$ProfilerPath" grouped_conv_bwd_data 1 "$bwd_data_layout" "$dim" --instances > backward_data/profiler/nhwgc_fp16.conf
    "$ProfilerPath" grouped_conv_bwd_data 2 "$bwd_data_layout" "$dim" --instances > backward_data/profiler/nhwgc_bf16.conf

    # 3D
    dim=3
    "$ProfilerPath" grouped_conv_bwd_data 2 "$bwd_data_layout" "$dim" --instances > backward_data/profiler/ndhwgc_bf16.conf
    "$ProfilerPath" grouped_conv_bwd_data 1 "$bwd_data_layout" "$dim" --instances > backward_data/profiler/ndhwgc_fp16.conf
    "$ProfilerPath" grouped_conv_bwd_data 0 "$bwd_data_layout" "$dim" --instances > backward_data/profiler/ndhwgc_fp32.conf

fi
|
||||
|
||||
# Make sure the per-op test config directories exist (no-op when already present).
mkdir -p \
    forward/tests \
    backward_weight/tests \
    backward_data/tests
|
||||
|
||||
# Do not change the existing fwd test configs
|
||||
|
||||
# For BWD weight, generate new test configs by taking 20% of the profiler configs for each data type and layout
for layout in nhwgc ndhwgc; do
    for dtype in fp32 fp16 bf16; do
        profiler_config="backward_weight/profiler/${layout}_${dtype}.conf"
        test_config="backward_weight/tests/${layout}_${dtype}.conf"

        # The output redirection truncates the test config before awk runs, so a
        # missing profiler config would wipe an existing test config. Skip instead.
        if [ ! -f "$profiler_config" ]; then
            echo "Warning: '$profiler_config' not found; leaving '$test_config' untouched." >&2
            continue
        fi

        # Keep every 5th line (lines 5, 10, 15, ...), i.e. 20% of the profiler config.
        # A profiler config with fewer than 5 lines yields an empty test config.
        awk 'NR % 5 == 0' "$profiler_config" > "$test_config"
    done
done
|
||||
|
||||
# For BWD data, generate new test configs by taking 20% of the profiler configs for each data type and layout
for layout in nhwgc ndhwgc; do
    for dtype in fp32 fp16 bf16; do
        profiler_config="backward_data/profiler/${layout}_${dtype}.conf"
        test_config="backward_data/tests/${layout}_${dtype}.conf"

        # The output redirection truncates the test config before awk runs, so a
        # missing profiler config would wipe an existing test config. Skip instead.
        if [ ! -f "$profiler_config" ]; then
            echo "Warning: '$profiler_config' not found; leaving '$test_config' untouched." >&2
            continue
        fi

        # Keep every 5th line (lines 5, 10, 15, ...), i.e. 20% of the profiler config.
        # A profiler config with fewer than 5 lines yields an empty test config.
        awk 'NR % 5 == 0' "$profiler_config" > "$test_config"
    done
done
|
||||
@@ -261,4 +261,4 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<64, 16, 16, 64, Filte
|
||||
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4>
|
||||
# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4>
|
||||
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1>
|
||||
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 2, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1>
|
||||
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3_DirectLoad<256, 64, 64, 64, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 2, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1>
|
||||
Reference in New Issue
Block a user