From a4ed178b4c8d4053400deea71baa7ac7a9f13e64 Mon Sep 17 00:00:00 2001 From: OscarXu Date: Mon, 19 May 2025 19:08:15 +0800 Subject: [PATCH] Add gemm2 v3 64x128x128 --- .../hsa/gfx942/moe_bs_stage2_v3_64x128x128.co | Bin 0 -> 27144 bytes .../moe_gemm2_xdl_fp8_blockscale.cpp | 4 ++-- .../device/impl/device_moe_gemm_blockscale.hpp | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) create mode 100755 example/65_gemm_multiply_multiply/hsa/gfx942/moe_bs_stage2_v3_64x128x128.co diff --git a/example/65_gemm_multiply_multiply/hsa/gfx942/moe_bs_stage2_v3_64x128x128.co b/example/65_gemm_multiply_multiply/hsa/gfx942/moe_bs_stage2_v3_64x128x128.co new file mode 100755 index 0000000000000000000000000000000000000000..849c3fa06da7832e5bdcf565fd4c8e60ce382851 GIT binary patch literal 27144 zcmeHw4RjmFb?#uX009sHK?sCMn7=RtfuKZ@APG^lekhuxD4Akq{aKb}k>H;|h$3l{ zk}1}PmM}%iDn-+bqsWeHs*YZ)$Zn#zj$iF0B;_i8uAkb9w3}DEIXSZ1o}>6RZJPeP z#-}yjcV~C`XUTE;a+;UZ;AnVf=FXj&ot?RR?>Don6IHdW@UP0iwNa1`&0B^mC96VP|8s@$=&W(X_GlBMPvL&a_9nM$vnwe z&SHA8PEM1u0lSFZPk{GQ`S8avxx zzfI-&hnxQ(|7cH3Pkr-obfT@PzNa_Tboh-I^N-Ed%f2Zm7S{WX{D(uwT3Wk1F?ci% z<+mQ_Cm(wuzq7Tqr-eqI%3ZsRnnUeJTj~$Dbu?dWJYLUJx4$2MQhvKTVz~7kxpw*g z{g!ByTXM0pxw*c*v$N}S--C+JxCbw)wSAsJzxLFB*-`&m%dwV@Uu@XRK4aKlnyc<} zjr-HT(s9rJ70&icG(S5!f1&vv_>6)7{rUPn&&ZemN(cT|sb*f}GxQf6`RdOY`L8di z@be7)HRQGb1Y7h!|=&oLUsI_lhaX@_rs}Dbo@?e)Ax5E^Llp575FSN zy{42`?Y?rQezJ?HZR($k@+8J=0xK89>dj0L4NQ}rUCx+6ma`eCEZL`I&pp)^{JU#^ zJzEt1rd|)vyp37HZj+D6%+#l-QYKKcrMs>9 z;kKTZ?VT-KT8PR-r_?jQM$A9;q9SF zXLnWIu0VZZ;oY6>M~`%Q`amj!;j}?l3IQL*c4j74?C@_M`2aD|jEP z`XJPMUrXDeaBpqhu66an=9Xh^O)Z6`bS=BOTAJEmfA;fgs|u=W+X5v){4JyqD4qKo zES`%h*}T-^KU5}wYpyTd)$-8MmX0P|WNjN&QWPL@Tobs}xIDQ5-W+A_s;MVQ= z3TLs-W%c#pwyyf7qixOgEv2Qwrk18qVRLh1VNprx+;X^eMA37%W>W4qNCnLQ%H`-~ zC1?6GK|4sPK82n}q{Fj^qU-%{5_W{)LYx)yR(_QHLiRqE4 zN{uj0SD_tpj#ApG(@rJnPEKjrQ ztyZqXIk9}q30^7k*HOM8=Tp6gxM3(BkFyy2PWI$?JYJ_q4C};Gj~a@_sVmZ{lM%z{ zkTv_%!kd;kf236u%?>3x*e>Zi|B+H9I0BbqKv zvN)3t{3ssR#5FOr8=rlH zGF?oAxLsqHmN84Lq}VntF=#$Q0eN>02)L|9GFs1cMIwlAnChh)#ft?VTwBrXNJ7FcNXD2KYsD|kir)6sD zO&Xbsi&ih)$GBeL^qk({7WFr0G>2_sJv%uou7$nfN%agdefA&|Jd)5H)gwPqFR;r@ z@10@>ubAMCq)h0?^uMcrB1Nws&_9+k)*YGr7P7O^KM6WJFgW78F$*c)Y@ z*ws=`?9GiHSIjz^$ex%S`i|%oIuH1NqIF%?GR>@}Bvl*`MW4{=^rvak#`Bkpku<#{ zJn)@zU$#c?AJ#W`^F#WG6i>Sx7YtDeH(?d~_Zh7*S=0#oql03rN8?z}`()A!yib{e zAl=E>Y`(EjOnlH8aie^q@r)R+42LDm!?W`C}ON`@N1le{CG}Um@nm^VN-b{d*~Y*@Aq;>zwS5XHqmdLJFvZ z=o={m*((l4&U-zzXFR^W_FP}>TRxxHp6jVwhcb4}W}%e5HN%WXVX!OWE6MZuOINr1 zhrM1;$!d?k4r6LJZWl}QJpLBMgNWOYcp{8_@*cLO03UqvUJGNH@wdHUasV7he34d^ zbD-QIMV87Of}23Jl`&`h?MB>3?9cS{`b&Dc{p*f&N6L=$MygT1%=kUA26P=LFNg6e z>5TYGJ3IU(M373t9qN@Zf>S;DH8Dyx{=9?*+(TlXOLl3kRbFG{l9* zD9wYzEIw1z*2hBeWxQAcPSj(ZY9T!)*^_w$c4>>yzi@6J3Jzh%hqM z%l`2l{RTmxIr~JrAW_<|c~;kt`Sb|l){B!wUdV%QTlZ6) z9{j+L%kMmL{O$AupwZ*q8IeC`ZNwL-Zj>iIpNX`%`KLOfrjmX!GF2RglAI#G>k-UL z+^=fJPPxE0uj|JMe)1lkPlGy?*^hc%$0Hg0K+}&mX1LfVGumkr>fUj~^#xHn@kb+5 zB&S~?2z}@~%btAiu$Lwbt|vDlW>Wb#cXmwb#+8kWmdXn8C#(ub5)Va z6(t&nz2(%8{g&OJ>Bo0ksD1%$+CfFMLw)6adsFF~Pn3vfTD^^5&o<0mC(`D^ZDQ>D z^>FO^_dRqChrL=!Lt}w!uH|!An%aproX59Wc7UcHudtl>9_$Io>XWqsQA5fD;)$zr zp4Iq(NO?BP10vmzN{o|pk3FLTGCqjokIbh*4SPN`QyH2M z0;sszg*MQK6(*6|#(Kr}Dca68iSc)>ZKUJk?^g2lr5#v4Ol7c^?bMf3s887QzIhN- z+%s$5GLfCQbs|0So{4u&+b7J4EAJl<(|mrC^mZ!dHT3qHtd#H?tb<3iBU4ti(^n$T zOB3`oJu69fdRJ(!X&grqZ|F}9#{4S-9JE3!Fo^)eQQyuQ9_2FWq+cEE`zD($1e(!OOnE&a->nNP% zGe|0LJ<5UvaQek)c}{ufvq<@Fz(#r;Nk zW&S(2&X@09fp$k}*+d3_(7sZ3v+(vKq_irScV0EYJf<@e%xdO?q^mitl$nO;kgwG{N2 zCC^H{F39q6Ix?#x+Kw@%_UIWELAqnLoW}fiQu{&3Ml5$)i$U8h%d9L#oRaSUet1~> zZgk|6lT(t!q~a^+MXGjO3!8c^>}KuSVu@)=V%+W%#=Tv5Za>C^Y^QKNJaK*2?S_3C zhRsX1sGFT|Z-E^nSXbyqM`wN9maDAfwxSz0nY)uQ_dEN-6Z`hNi}$_x=*Z}#&pmY@ zJaOP4wBui0FFqi!)1&)+?gtve6Ag`Sf$|yzc53vX&t3m)c;eaT+#=;YE3l!_MxVQB zIy^D`25e;HO$%&r^f{k<;%a!}>YK2|k#|*KvC(II?k~R;o_OnRw~6xJ5}5mmYvGA& zf9f_<8k-BGuT{N6l=Kd*Pl{VhPK!is>qr+#QxVcvcpmvkU*UN~JkmY1E=9-@rDqVW zOM2FjjQWi6LWZ6(C$7vtQ`F}O=}U^!GlM8SCrFlb06iDnptpK1sLueRr29#R=DZ#H z-2>`?ey2Hs^`A);>z>;>ST{gJ)n@{mWl6_%aqKjGCxNkha1R}3ojAar^5mwls4rzl zG?$&9^#v+&%r#`f%A zZ1Z_p{3+;Z*kugqt!@wZCCMIDuHS2u$HASB`qqvLEHnMhwK#JkGTE9kd5M#h})hYvQm5&aqDwKbf1eDn~`c?X91#h zFEby}9hcZjqC21Gee+Vh@)-snU`DDQXIKm~Qq5_4RLw}+`q{v>8qW%f3x&)Bn{dUZ zC!yaIvuY-&e%V#iKdZ|o2ldqCDJTxaY2`?WS0;n{Xlf7ZXnyp zt=L8$*hZg6H)KM_1^XwPbOY=kTKoFdiv7cNz^CmWJ=QhZKL${;e;8y0$Pg6^1v~@D z{*kFUSKvB+_$D=H#wL3vtd+hA^KQ}j_GMT}~ZFC+*|lOjYJ*0X>#(+bRpxB+<@q%~OU2}tWv z7egA(vAkXx^s(YUD(h8L)+-bk)_*G2R{FC;nXYGsGE>hDWjfCeWjdZ3%CwIUW!lDu zGOeS0-I-_}8oFt}c1AKq$fcW+%#F0$JtJB7^EPx8XFJLDz}ng-uQ#y*6Afv785ZE0{2!`&3@;VxAGaW|fz-m}da5LFJ__=IOyZpz@Lz^K{_tQ+dY4 zJQ2JdDo?YRhqX0OD|j%nxW2DxG`I#~la615hM6VC{y)b?e-8e6NyR@+3jPUPu>m)4 zDgGG%&RGPU^N%#bG}uhWKTnMo7_3sEF)22znxV>PrXmY zKSzelz(0RScWO$~>!-7+LeK)Df-qeUqT-({K|d`a%|sK&Kbcm+KNFYWpYP7&pVET9 z80PR#!X??b3;3rI_@{v_Q}9m%@J}PhKhrgdQU)`{a((tW{L_BL6?5I;8n-FukCKov0sJ78g1GDNg9dbWiJ^5Mc_4z(4bDnJ`ZPilSuF>0)UKrg76m;+QFRden?HWJx6a)8=)a zPZ)OE7UG{*Sj;BloxnE*vMqskn#wu8X*DL;4B-ib$xT?PsJp^#Pw)-XGCx7aKh6Bv zu?YV(0sq`(1D*q{5)WV%`_v>`$F<0_aqQCo?9=4c8H2z+6UGvLH{pqtgoJ?v75j9U zc)jNcn?!v&)EB1uRP58P@a^;bHigfzPn*KG&GRD)pFi`g3g0@1T;?(j zBmf7MvCoKiFjC|Bi8e{bC$r4r(3r_<)t85LhnxL_%?6H(>USB$cGM%_y4!(^9#U~p zry!B9DA)zbuHd5iz(v;p7fm2s^g8g+*^QpqyTC@jU*|~!E}8&bbif1*^swl^6SkQv zDOnU+^c=U;a@?{aO!Hn-zCIk#IU-blwjMYP`J{qI8gy*)#LH_JV45<|YbCn-MtY^l zX;zs`GyF1LnJ;6SWxzBm6-+Z9n5G|?CgH7nfQzlTyOqVuPjL({UQ_Oq7_N77MflL> zas}tSq2QcpqiHJ6Ss>$_)$r8|fOD=Qs^Xlmk~z)^)Q|d~ql{NJ#!G!pjRi}AbCyMa zR$8_UFIvu3L*bT);tt zDh}G->MOdd)F0ds_74Yxo}vxK{zfi;L=4_l>~BK6*my*AwG~I`S=$KrjA#?tU2g{Y z7^b(CMw(GB9pwl&b%9cu7R8@vhCdO1vw_>&!Okv!5%A5mZ5@%)wyp?auBFB+;%?Bj zpu8M#(4xbw{@~$=9~fq&q%F*G&|>fj2Q324Zy_A?nkRnkET2c)k?&7>TBi|qN8`c) zj)iK32E=@#3To!uIV(uR9K(#1fnRUhdzKm0*^{09Xb1LYV3>N(f1k%({GYSNQW zcNX?x>`Z^gQ=o%qVi{fUoeCcWU!Z!a-628pA}!wAucLfZ)iYw(pH=dHxMPB(ViUk3 zyu>$e?bp$L+>Luq*uXgIJ$R-&V+^$6OeCX{aLwy7)>s*mah?5;BO6%7H1}X_?D_Jg zFvkoxM|50fR*`9tWr$}j**}Ma?!h`DU)8p5KgVzgKhnM{%Q)*r39D_2uqgYPMhju0 zjx+l$RGxfnqT=Rl9COlsUy(T|$~b7rJPyiz;dQ`4-+wIK({=yH>FYt$@BgXn1K@gP43uyH^kKDvffmXb zDB(c*`#&@jrenDO{ZC<_@f$7~>%8$QUqgg-3e>OFFx1gDtzSx;3*|JeW9rK&>J#pt z<=Cot|J&xOrdLdNHQg}fHJvi8X-Z9`Iedz$397) z9=Iaw(*xfXY|c=@VZuM9Tp9nQxzS%WkAI$n{-?Z0w(jRX%R%(T`K|rd2GG=R?X*7d z-l{Oo@ndQ`(8V<7eYwwY{PTf7s|^Q`?#9@|XyBNt7kZkmm*ze3dbhsKuV0<_gsld? zwavO8wD?;U);(X?8(u@0rZV@}RLS$0WTUtzglXmtF_!nGOS~;<^V?z`l~?9{-d*$M z`%BOk%I}FW`2V=BbxK)fzUSr17%0v6zz6btFaBV^g~}d7TZAR`p?=;UWxlUjBTB^Y z2F9e$D_}X9l-G;;2?Kqo5OW>ZT#I|EVxWN!vZH8D5dIl>++-pAviNa9?jL07Ui3kh%6CKGESm33evS~2k2%tF^PSiCv3WhO z@26%eLt{&n#*-+G9Z?!H(tkAH18IN7=X-J5Uvim+V? zJr!2*RpQfp*Ptw6p%>*yX(UH*2BM0vKYAM(AeQ&{hP(CI^<{m|*E zKBsXdJx-MLIQ19vx6h5VdFwOs+(sVeFwiJzY7p{n=!R%MKPAmE(0~2*`qn9lXW@pPh6v_yUG6XF3{U_KWN>Mj+Q`grK9cC_j#S{ z-AjFkPF8fVTb(O3PhoFl62<&*5ruAY62(2T$Az;B=6oRAdEk}s#4E45qiAz4boV2m z`=Gn4;Rozj{D5k9LUWp9(5rPvF@LL?cAI*!H&coBdojm|9@}i!JtW9^?Im_Rw}6$e zv-5k>j5;g#xUH?A)q7l4;&mx&u^V!u4ofb#(=q+YPaK9#t+@|pDJ;Jio;d%!J8&Mj zw&O(~t=+JK14n=Z55IUJJaOTxZo;_(7l3m+zU-rWM|%veJ$7p~X!;(Lb@8AkZ`97_cE4LyEVrkTK&9>=UZ@#x)u126ZlUH(#=TM zDCsRoS0lYeNmn6ViFCD+PDyEqI#NzWmm%MtvO7xcK-clP0o`T?GRX1jDwMAb39J&- zk9HnF`wsJK;s9&#Gy!93guIG5J&5!Hs*B13a~e3i+v7+1{N%6Ug@LQr{jV;-pMYx> z)KoLtk5}>Je6%t*ciAWJLAHkQDbPy7lk*%}W=Oxt@#MT?0yDV2%JJm9uOgm$f#b<} z&m-=9p5w`R&mr!3j^oLB&me9e;dpZ1CB$u~Ii8$nMxEAE9M{5Bn~}zzXBg)hY5W<6 zai5W%8fO@f8R_h4-Z$X)Gt#MkhOwHFP6yalq9c9=9DPO_b+h3VO_(rO+9L_TpB!M! zLw1*j_(BeD-@ngjHfhA+7fyvIPMwy^of24JARoB!B>U-& zF<{a!J{6vL>a4uRpAvv6p9@c%dq!T%=a8P5-@^)E4~u=}Jl7e7Ne9j&{er9`_jv7lBHci`N&gSBPBZ+otjDb1m8tEDtk39NLnUlg!ltWX ztCmAXfDBRbp9LLJ(|2WRz9MiPNOo;G>`}5yD`1b7Q9F3v>p|hSK*!K?O|eI}F1AMr znL7ndPj%G*(7!ir2pk>OZ-)wr2s?Nt@#^RH&lVeN@2&ZsHH!LfGJa)Biw zZcRz&Si3caFmgLEa$D30{bX1sFbm@5l+>sJX>&4~C7on4{9LSE#@6Tco(_6XT!OXd zW9=`|WgKgVF5_6cKl?usYkx__u$N-(Sf>|N-cqcc*0;)AinY`FR(VUYcJQ86c}uZ& z@Xo2crC2+7PpQ16SUY%Qx5L^KrT-AtuAM*sTdm;2+95MP(`&~0-*PRT{~elS2@3uV zyu1PTP7B;p9~et&Y|^a-_N~)TXQMtk<4f50H0mh_Wys$LLfE)OX`_O5quhB) zD`$Sg-<`+2foV=NZY*U)98Edx7}Z_T4Tw73 zIJ<&Jo`a3+9KgGoe?XF$lzzn$v)Iu#&OKxS_qJrYC8oPGmJnwg3ug)z&UBC+wv1)n zGM|lEm~{abp2HHd?7+fxqjs%y)~>aWJMBJ6!*XL*EgNUJ1{RY>ZOkSmYm$M5JG|!0 zY}o9Ov2gi3aSn?)OdgzO;pc%(uyq9;p3yi9tYYIh6Kp1IT*pi{Nv28k%rfDOu#AN( zm@>}F8W=F^y9{DnujJ`PJfGyeA;In)Ht2CC*pRKmzJMLJt-~PTJg|z1_eVzh4SRI-PBpmAyl$Jw&jn{I`Pl|M)2L0G$T9IuB|p;u zdsEKmHg>w4A7?Lenc!djmF$a?dF)G*=jbf5B5R4UQ4G*j zHu`VYkS;qpYXZ#y)q`e%ilB4`I1d=N!0XeF4%kzDWBmqTxu)pP45qz0!g5U}ub616 zjf2v)iiw6glX9(V6`XaLb$JK7%pT)IPJcRV$wZ??q`nxEQsJxJoDo3D{}Zix;M&02 zu^(lANc0OhJA4OhAOU#pU`MZS?aGLMxL@>?tqIQ;mXxtRZ`lRbn52!CKegEJY2rZN#uBZsQoB;56~JS76=8#j-i< zeT5+BXV{7ExRJd|$7$v|ndSs!nibSZT43B%CIc7;o#P~IBi+i_am2|-yhAUqVfCy< zF%3OzV>kpZX87bi4~Cp5_6+p#%0}+jnHzS8CwA|3yD_i*d|vZ&M4Q%Q90lw5#L>RJ zzT@!U$oF-^CLo%M{ddA3xqqzqX6DV$h$DRmeKem47g_aitw}w@V^_}bIFvIys~G&{ z?Sjb%>H#HRO;CI_Q8}~Y2G0ea7c?DI^}n1Mvj3HuL+3bQ(;!WlmN}-}Y$D*S6VZS=2wsN5gx z>-Xgxu83qE-WbWkJU-nuxa*36L79apRzB?Qa^bQ z$Mda3OE4Gc?9b{gbjITR?5Zk(!Oox6o3G5_N%8PL?&l5c>jswl>|d~+M1CEeGm3Y2&G~Gls9Cjh1{#?MZO<0qJQ2-J#QAw3P<|dL3uiwvdrv(YNgv`q zTvorpBCbL1!(}~!cxsIMa9OR0J0d z$3*4xF{`1^*e1qSA7|XJbF()ebq{!<+g_L)fN!TEdF=nIejWGSN{rI#-<~f|WS||{&vL8&Ss}_F`h4YA*Do$d z?P&@izr7|rQL_!tDYS>{AE=SrgC0S9vd?!X+FSkL;&NPml~JPdOUrUzE_mqsE&J^Q z?{;Ii(kPwZEsu+QZYc*t_h}zdqudwGeTt6T{pO>Zdik6ZAG>|f)DOz%l=!t_PH46_ z@UxGE&A1!nb4LsH-cp@Mc&Fx&d@hRoNH^wK;9+JWJ-Jlx1+czx#?Dj>iZx==DEE>4 zO}dYo-P4a^Q%hW z+Z5xBQUcaLonb0MI)JoZN#FVmlbxSo!Yd<;>lOXaKf{y;KPq(rp0pfk=MulFbctUD z-NOAUQ0`Y{+Wr&$s=2d4OZ_TZ7pgC_)UTp-q4JjcRp8aCyrq5>cys5QmikrT&7E&r z>Q{j`cfM((|vPUN0 zxRw8>6{gFn0mc8LGf@(y@lNU(QSB1{?;E%F|B}Rn@dWPw*#>M$vj1m;|CgjmI+*}} z$&_G|65%V6|Cb8?FAe@*VS$PJf2J#mvBc$xxAFg$E%yJG!T(!E{@+IK{}o(Gh$Yz3 zHvGRL_<9LNLHK`bV@bF4^K@fHbNM(U^mM`#EP=r}_<#0M2mC*W z)-mpMtcU-%A(pVf|1&d(WYMI+|4a4i&$D5D>O%i-hQ(6h|1I|S5@mnSfRFl^!xLsJ z{3gS!UN^&^L+}gD)7%d<7#8|}alQOb=sCTa`&kK1sp4`xqx3NLh7AGuefn&>s2yL1 zyk$5GHN|{h?$0fFU(*JEM%$$La|_r67KMK4)HaaAv zdIwS+K675G-;}q??@V(<1|1F$+?-jnu|>3|xWeYut0M7&RG)-totWkfFHcJiXTV-^ z8cRf$t+||zy|fIiPrM9&-jPxeF)4oHNP%zfFy2{A#k-4fQ1T}c=VyTE9mnB|8wz-Psc6r%p}?P-TfpTrb2mof!S(#i z&o0!JWjxN$?WBU5*5vz_=jMZ7z-{U%_-V$Fcmi}es436kH|07a&OE=E!&jQ5IJlo%R1T&EJ37iM8LLto@zr#LiyKRl)jW-RRAq%J1M@xWLzA2y=etA;v=gc$L2+ z6TFJO%9}s&h5q3!{(cPEq`|kaH+l2#e4)R;#@~@4yEJ$WKFrO(^@V;czw<-3Y4Ezh zhHif3+X*`@_`8BYwDkA3>Gp#9k*)(>^?UnuWS@4wcG*Pyz?p42qQy^cjuvYjN}6+UA(R0$6Y#-cc1j@DnLDt`*}TGuRUdI zr}n=n-~>6^2dxIJ0Nn>#3EK1RF%#{TlN{OUL{m4L`2Ih|?Ik8&r(>N-zUFesKe^vR z=Z`9%+-9LNCdd(%V1``m=BzLDj=YaVc6bo`NTDCe`$%Mm2WcM(_m}+sI6KSz;)lLH zcuTu{H`;JLxzkbun*QYLmRn&MH0Q`(Uy2)4lI+QsEwq=Be)1B3A9WtwSH z{EA_LZ9%*`Wh7dGboEL2Hn8F8-2@dss6xCFd(DI&R1$vh_M3C~K@*nzTp$01c|z!S zzk(kiUQPSIpcSB*z7P7mct1CBIbmSl3z1@oa4waX?n0F6& z=T%me-Wv4WO??N(P=S8ly~)V?S)l>% zs^H<*sl4Y%(k=IMF(KJM%KO>D5$xyw!x-P&<$7@pbL;Qrb;)%i z?eG3rUYA@S(z;xtci0l0!``{T&w%iCeF5cvbdm4x@^yVtVEbQt#dPpBW+I&sd{ovY zr31452ppAZ@o`y?kp8KFuT44>^Oei^A+r&(s~|(vee`3I_HI4LKjuDnIm(uy>_*V_ zpu`J4DB!#>bZO`#k!WE1`~2Kh@xAYJnIOsrz6G}&v=}siGJeSTL05&|7KysIU*hjW zdG5W$W%5zhjk11F4`}_j8%z&;dmqOn7GfE0$S%bwyycJ$HClY2o)F%9M45U0D8FC1 z6syfdSyxD~WP_%MJ~n?Y`~hX{*r!ef#Tibso!ZIxIrsz0vio0`_jT{TWIq3=^Y}w9 zFaX~I{9z^1-d_-ZX!#}K4^2z(2k0h_KY+@*=|2;HnA^Kuia%h@&h6bU#UH?%+q+$g zKY%y4ce@mS0B>&Zb}9Y<-rU~pQv3nDF%{ETia&sd^DJ1c97A3F4wY3R{DGTt7~7k8 z*^GXPG9eR~|8xrdH-&-4#DD(>mD2B@KZ$n|%~tpTz4&iLtw>4Fk=nKne8#;j8y|Pi zuC1-iE-cM2$Sa3$g>n#U&-#t%2fDsHm|xw63r< z*wPYcZE7wo2?o|Rh6);s*R_U9ic6YHn}dzH>=)@oq`UDrzDVD*sO@~}a&fSb%DlUps=W^s4!Gi+*%kYDhf0M9{=ngA7{b9F2WdD_`~z?mfx+t z-FI<;@5|-AaxLojcyIT;+(F!5`d;qR@88m+?y)Avcyfl3-AK5MK-JSJaq2}hcjzcWJv8RXS%l{ogep8Pufmui&gJtwB z{@zd{Xc#rhF`3@(d$+gCS3gQh-|GI&cW{669o%2j@8H%~-e0$Q`&L-Hzl0y@xda3( zfBq{~A}Uq?{}6@E)aQ?yCRutEsq*S`PNk*P0DknGqbT0B=0ElMN9yTc;_*&ZUVYA~ zv|TB$@>O{yn~^@Ix>dgVTvTb-5_z@%Dg}(lviXepE|%wj{=EE>?tV&sC*+rwp{CUK zDZcQxv}_1W$}3n{2ys; #else -static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale< +static constexpr ck::index_t MPerBlock = 64; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale< Row, Col, DsLayout, ELayout, A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, @@ -169,7 +169,7 @@ static constexpr ck::index_t MPerBlock = 128; using DeviceOpInstance = ck::tenso MPerBlock, 128, 128, 16, 16, 16, 16, - 4, 4, + 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 2, 2, S<1, 32, 1, 8>, S<2, 1, 1, 1>, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp index 5682b38918..2f8e854514 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp @@ -409,9 +409,13 @@ struct DeviceMoeGemmBlockScale { hsa_name = std::string("moe_bs_stage2_v3_128x128x128"); } + else if constexpr(MPerBlock == 64) + { + hsa_name = std::string("moe_bs_stage2_v3_64x128x128"); + } else { - printf("Faild: v3 only support 128x128x1288.\n"); + printf("Faild: v3 only support 128x128x1288 or 64x128x1288.\n"); } } }