tweak on amd

[ROCm/composable_kernel commit: 4908fe3fdc]
2026-05-24 06:44:36 +00:00 · 2019-08-08 12:14:06 -05:00
parent 7b5f9bebc6
commit 807d22de0b
7 changed files with 117 additions and 85 deletions
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -54,7 +54,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 128;
@@ -88,41 +88,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

-    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
-#elif 0 // debug
-    constexpr index_t BlockSize = 256;
-
-    constexpr index_t BPerBlock = 128;
-    constexpr index_t KPerBlock = 128;
-    constexpr index_t EPerBlock = 8;
-
-    constexpr index_t GemmMPerThreadSubC = 4;
-    constexpr index_t GemmNPerThreadSubC = 4;
-    constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 4;
-    constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 4;
-    constexpr index_t GemmKPerThreadLoop = 1;
-    constexpr index_t GemmDataPerReadA   = 4;
-    constexpr index_t GemmDataPerReadB   = 4;
-
-    using InBlockCopySubLengths_E_B            = Sequence<1, 4>;
-    using InBlockCopyClusterLengths_E_B        = Sequence<8, 32>;
-    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
-    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
-    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
-
-    constexpr index_t InBlockCopyDataPerAccess_B = 1;
-
-    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
-    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
-    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
-    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
-    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
-
-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
-    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-
    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
 #elif 1
    // 1x1 filter, 8x8 image
@@ -160,6 +125,42 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
+#elif 0
+    // 1x1 filter, 14x14 image
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B            = Sequence<2, 2>;
+    using InBlockCopyClusterLengths_E_B        = Sequence<4, 64>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 2;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+    constexpr index_t OutThreadCopyDataPerAccess_B = 2;
 #endif

    constexpr index_t B = N * Ho * Wo;