Optimization for gridwise group norm (#453)

* use another instance to check the efficiency * optimize group layer norm * 1. coalesce load/store data for gridwise layer norm welford. 2. move a sqrt and division into an outer static loop * add more instances to layernorm * add 2 more test cases * remove ignore in generating tuple of vector Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 40942b9098]
2026-05-24 06:44:36 +00:00 · 2022-10-07 10:24:13 +08:00
parent 6530303470
commit 8d0f21b230
4 changed files with 151 additions and 93 deletions
--- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
@@ -55,26 +55,26 @@ using DeviceInstance =
                                                      YElementOp,
                                                      Rank,
                                                      NumReduceDim,
-                                                      256, // BlockSize
-                                                      8,   // ClusterM
-                                                      32,  // ClusterK
-                                                      1,   // SliceM
-                                                      8,   // SliceK
-                                                      1,   // SrcVecDim (0=M, 1=K)
-                                                      8,   // SrcScalarPerVector
-                                                      1,   // GammaVecDim (0=M, 1=K)
-                                                      8,   // GammaScalarPerVector
-                                                      1,   // BetaVecDim (0=M, 1=K)
-                                                      8,   // BetaScalarPerVector
-                                                      8>;  // OutScalarPerVector
+                                                      1024, // BlockSize
+                                                      1,    // ClusterM
+                                                      1024, // ClusterK
+                                                      1,    // SliceM
+                                                      32,   // SliceK
+                                                      1,    // SrcVecDim (0=M, 1=K)
+                                                      2,    // SrcScalarPerVector
+                                                      1,    // GammaVecDim (0=M, 1=K)
+                                                      2,    // GammaScalarPerVector
+                                                      1,    // BetaVecDim (0=M, 1=K)
+                                                      2,    // BetaScalarPerVector
+                                                      2>;   // OutScalarPerVector

 int main(int argc, char* argv[])
 {
-    ck::index_t N = 128;
-    ck::index_t H = 16;
-    ck::index_t W = 16;
+    ck::index_t N = 2;
+    ck::index_t H = 32;
+    ck::index_t W = 32;
    ck::index_t G = 32;
-    ck::index_t C = 40;
+    ck::index_t C = 30;

    if(argc == 1)
    {