From e7fe8587f6d73f808dcf55d60b5ebff993cd9bf0 Mon Sep 17 00:00:00 2001 From: OscarXu Date: Tue, 6 May 2025 10:47:29 +0800 Subject: [PATCH] Add asm for no-loop v3_128x128x128 --- .../hsa/moe_bs_stage2_v3_128x128x128.co | Bin 23312 -> 36856 bytes .../impl/device_moe_gemm_blockscale.hpp | 48 +++++++++--------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v3_128x128x128.co b/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v3_128x128x128.co index 128657224d02ee03f42dda36feb6e2fc308dba86..b402f82f335daef43996a49ef5ee0563fc9c6e40 100755 GIT binary patch delta 13327 zcmche4|EjOy~pp~ot^AXb~n3A5|%WEWm&>zg%BeDj4=d?P;!tWT7*2S)LNJg6cD3e z1ZH*(A!3X%%k!vJCmCwMW?Zz4y+BY)H0N zd)}UteCFQYotZm-zQ4Kiy}NDCS-vo6TbO=v?*ox#n&ct;lOi$87S!5U#LL5k3`^&l zdQncX_j)oy`g#%9x21E1C=y-a3lBW$%4f~bn$6EXE8=ILO;lKleHU*fvC>i;ytrDd zbz$R)7c7g4CS{506eUq4ta?~hqKXz-HYIYT*Ppq}+D@^sz*3!vN^2}56CKuJblLKR z-4<|@pO@aYc*V+_e!TqlW%aWYlWc{SNGg8FT4ISO{@42Vgkdd8?44)Ju+pVi>5^0% zw&I&uF_d`8c5Pzz^|I!(qE{nC2agm9j%59`;`F4hk*FqnC@u@?t8+v}wOLQBqF(P) ze<|wu_ScGrYy5&bYflt{ldka^cG21AMso+G&xSUjo^NQHXxA}Z@Xb`cD$2)l1+OzZ z)YWG@XinI^7z^;H2bQzabdoDOrjq(%*I z$X)?L@)O;bg2YlKM7zfE#obD-k5s3((&w{~Er8lNiCdLHZWc~$qWY&S=opWxTKXTP zG4ZT&b>g^E-s>;&49)(NWC=aN*BMI+)CF%*2c(bixC=d$D3M{)Ly&>b!?pq*{)xa<%m}r zSGH9mHc6KC%d@iLcT=(LO1q>UCSsYjPv|)=ZJgA3NN-+WXW!VQ1ey;jw&qUaQM%f& zO`XbJ9gLFP6#$7 zb%?}{*n`DHzu_%~Xq4!4-m(+Tqw8}two(oeE4%IMZLP#%Y0nz&p6##yUD<|i=FLV4+TFE*PFn1aV1#e~LB1LSt zC=gu1%7y063k!p#cDtT6QECtHoqSI!0&PpGB|qQCBx#K7h}#THn`G4=l&n&FnP#); zV=b1ruFHDPAy?dEa?CZz)t>iw-u<#yZl}IM?N+k>gtveaPo5`c zC6*I6mpCO)+|xElmQ~LoPt51>w0pdsm`9!-W7`~eNRCf5JZ-Ym+3v)FIVYCsvS(~O zE9s1}{H0Ay%6!3Tc-v$&^gisp-}A8Ney7(NlRa|Gxxv{cZ@`MO+-{Sm=@xQTT(**S zf}y+S{j+!w0`hUOMQETU+J0 z+&Y77I7pqAc6R>|a`4myq>Wg8RXAQ>8os-}Fsvho#Os3L#yTw!ZmKH^+vC$)hvC!) zknD=VB5)+=2Zw<9pe1f;9U85S431Vs+|lZY_qMW#=eE*F&TXS2#53pgFgH357OxW= zxmIw-6Y_GE}mh0$Zsm_MZ*c?Y8nRDF| zXa%hx1t~~8=TG6uF`H2$q=peskm-!*$5ayF@hV2dD;N=<0BWp3IVs}}N*U3hm=O(% z7!l7;9wsAMre1}_L){j+^1KvgNoOUw|3nxWaF7n!t~Q)s&VYqAZ2yO zC}}?PQJ8*wTs4lL2uiH(I3-PI&W7oSuc^jsuLosT_ZlUQXI>A}H{Mi@H~$c{vpPKC z<7ZBU>B+ZL?YE*$!Be8H@M+W?6Lr3AqAqu*B4rW1ll>vjcFrSvPtw0qWj-i@b3qxr z4zz=sj0FV?cF+k@&<#qU2TW30yb!0p@qBLa20}tfTpl)7QV2a^$)nO@ToSkYoXt%R zA0J1NlCV>ZD!TP|F2`E-6XoV*!R<)cWg&xuO6tap=f0wam_C1kdojQ`myl{k+{p_W zaq)W^6HDmF*qJuSf?MSLDJ(lZg7$!*R3=CY(V2;7C~ly`$0xiB7Vc@%e%;dPA)8Nn zhfwMvTTXhvLumog(C`Er6zpztv*0C^x>;~2rGB=N*AesYYSMny(&=JxFQqOPA4F+B zi=Rb2e@By(#qub1ve;lsb6M;o#B#ScX@^?y@Il-`slwvfl=@iw6T~&&)+PrFx+rz9 zU=HgYf}bJi+0w+0-p=9{O6@FeWgSHP3&h=zHxU-I;g(^sEJ~55y7j;dY*5#&7w%)P zBi(wzE`n3mt^2pIfnB$r`v_D4`r3(xkzcp&*@$Zek_J*1n~L=L>w~e=o)AWY-8vmh z?9jPm@-XOp(%bedj~;u=9kTOS2}XzKy6uxM;jt_ku{WI|-U&JkojBK>HF*e+$u`7Z zQ$oCBbO<_ju3Mg*r}3aoLh!gf#CuD<=q(b?WG|0NR>Y3UA$|lnM=beBlU+P!rHCDo zLj0(x1xJO%G}*yp)PmSy8sbMvcUmln(aAO*vk;G7dH#snaYQ4kL1=IxOC;>1@I(^h zi6nK$Cdo1WyeCK`i`ba^%YbSGg2A%D%WKK!TUBH0_8`w6Te1G9J5*!GE|EKSp#1S} z)!4m9_^4P1FCW01(7Tcp!~s$s`27Mkt|-M zWZnCR@dQAgc#K_hmf^3GzGvcDx!4GntJ;rexYj18DzF+%ru3;!I+!XJ^{C=WXiI zK9~J;Bn7Vfsiy96EL`_E%PHOeBdv3dxhu&oHTh( z=_AkWJ@UNvB6&8dMq^Wu%d-)J`LtOznl}o0HlzHp)}k6MtwH`=Z$ZU}ZK~1MF7(-k z@*A5}W7C5|o|{lU`H*Tn^oWq>LnwdnsA@d=xRB?gl&pP!tB~hfQ=Ty)yWs7Dp*sZ& zekxd~-6|G*KM~B0rt5P8-tfilSD337RYT%I78C)kye=l1^OImM=# z?_#Dt>(lj#7d}B&k9qX72ly+k3K6*eO9FDbMwgxQ=S#QN@~q(U)V7?g=%vpJQ=b(neSRBxKAJ(E51I1RXg`n8O&9cekv!Wn z$g?GbJe&KMr^X+ljRWd4qdb$4=R%}v_5_`pCQ(NpiEiwX=v7>zx2VP~i-KjhysY67 zs;Y)s7vvgMA*oN7s>ae~LZeGj{&Vz66yy>GxkNQABwsBLv(v~QqZ|9vsW_b%9;0RH zGKF)_M1axKYxrq!biw{Q>K0{(}#LS#(q<#elF7o z`H0wW%GA$gx~XR*>^EiV&u04fweu0M-_)s}>$I(BF702ZT&8LUncgz+Qxx7uXX21D{vY|BdGi0i-}zZDzmw2(zmxf&-0zfVS2ep{ z);@k`dw+iCboiZBrr#L~G?Mxe!tZnzcfj-1E=}`0cMVJVohi?RjEH?elVtcUE z?<@PKYG2XumNPyv}A%685=p=i7pe(u7uPK=Sg>4s>UADSI^!Vh)94|Q#H zaX&PBP`2>=7W|qA2h*z z&@3u(AJkDGn4K@^@=1EK+Rc1TlA~DLr;=v)q4ykQ{;2}nvd0s}`R_4BM^hw*DtkCy zC5MxA&N{a8-BnW9n^UEFa;nvwoE54&XQdjScU`z~UQM`Z-uJ@RxY9Zb*VW>uWGrJY zg^dS#D4EFU#75(!Rhlt7Vx2K7LT4<5H+sYAp>BzHJfJCJVUjF(IgTiDQ*~s-O;wT5 zO*f+Pb*D$YO40bb(_;?^PIyjm;y#ehhmndl`z`v&YvKM$g zjrsmKwZSYGz$DFw**g0B`=o*erRtq#{a_ywH&*>X`GJ=U9&^gLB=vAajNX zfy^29fy`7L0y~qP)>7u3qYPJfBfED>nTL)t+}$R250x@69cA=C&hDjBpN5J6-@r0P ztXRj06&5j~;myo+#SUgOB0ilF@hV2dC#+#dUv=ayDg7O>?{(zJzxMt6zytf&*_tKuk@U@>hq^?+8!q$PTM1lIBgFy;f27Av8fABY-liQ(L0 zZnKhCqmN}XH@W7A81J)%SwG5_7^nU~aM}XFimL@HuMsT2N^soF>@?fx7#r(8W>xmG zu`alWjdj7@pdH)=D&P*#32q17;1&%F9xVKB$;|8?&-+TzsZmqJ&Vu6wLzib?@VrNh zy27!jOL^WSL|tx4dgmUjM(1W;mh%uO&HS~y9h7JOr+X8q*=IiEma(AB{JGl>I%huN zrl5P~&)gE|nR%}}nex1M<@M`%Kgm7sA7pXQdryC!x42Jmd-wIcwa$3%tPxBO+aDC8 z_U-Lr)V>|^Sjg&osUp7H6558H{oeGvvCKVhkbB-B_q;*jd3RWa=RF(j*Ynnx=?(k% z4m23Z^KL*aljpq-@qs+=Er@0EytS=pFp%fH9kEQF_YTCrrRS|N(_6d(Y{W@puK>L~ z?>6tio_9;mz@B%rYhcg2F}t_ttufR4f}y562KKzyWDV?jueJ^BdDmMp9>Rh3=Xvka zu8DTcv2f43BkhenCbP_zx?_`MF3)Vt^WKTfMs^On;CVkeB5H(2ij2m(v8^O(gfA7z zER6CKqoc;CF(Q$TLRlLdHOej%NlYt4#a-j0#<+s|>;jw{^tpzOGeCnme&QjxeE!`%e+PRDmd z;&NoWJ7y0TCj4BHjPUgBZNl?QE8GK(x$vK)*$3P2FcZ;AGZC3CABHR!9?#_bsZ7bI zKNG&ebom;3neYvoX&*H7arqid`=Fth%hzDq2Mv8(z6R4iXz1ngHJI)}Lm!u~!E_HA zOp{Mz6J`yjebA7y55A4dSD7JaOu8s%i1{u)Q8q3^(kM$y8tFM>REC@p&X6-gEN5se zG4wN0mdS)S^M#(4FD!lXh0>ER)?bt_{-PT1{7;aNfV6iY;cb6ajd%Yh9KLr^e&T)A zc<)S*k9^)kS^H2m&VD2uzOyLb^{Hxn@|ke>K0$fam#Xo_--W~X1kYA-}1=rxLFJ=O?F~5)~D{6|C$QEPqLG+|hJ-GTjr0RapU2a2Y6p zOF4CynUeoIGhn|Lo*RBl-s?e>;l4 z;!MuIEX=MepQd=FziA3wUtHb`c_V$AVmq6r(AXP}IZv_un+`O-#WaQX4d*HTRnrvO zH=L)yG(}g2yzyd&ym6p+-hlqozLB#(L*m%mJ8|^M8@n^)jU5^C#^|k?=P7W1u`|R! zD;$Yk*A$h9G1g`usDXbT#Tx?qe0n~~k_K6A25>j=&k==vQQbTDcN%X=?IiVk3bO%w z{}W%yA7QQmhlzbg;;+Bo4*auU?cam+g5Mfw7ya1aO$7LF0s4IBpG{J+A1AOc*0E1D z|4+z3KSCJjHv+JYQa?_>7M@vm?IP>f{F?qIIw>o$^7>NW3ikDXd{VSq68o+%^`=g8 zigOYlUO&cr#60yWE=-J_KjxCu1xqn?!T5F)-3taK+U9$Fm?sb)lM;3`e~cFDHY9e{R^`oV7PZ^V8;+OQmj4esA+&@5 delta 1141 zcmcgrO=uHA6n- zW|OAx=B0bgNu&!31u;Sn^(2CbRuF_76g*g{+C#O@?95c!;?)PU-^}-ZcHTEL>kD-5 z37I=i&C9jH8I;EPiylO z(~;pK@59#E77oiyN8m9AN3<|Bq*AKVCpObt}gu^W3gF0D3ct-_3lJTV-@$Wbf z#waiW0^=Lu#*q)^hi*0?!9RJ3h4dVli3Im$R?^KT8GBREHx|p+?!LV6b>YsrlkkHp z$p=3eZf&?qI&s6$yHrJ~k8tA)y%(Qpx__~Kkz&2$yj2p8#{!@c!`Gx6Pel7^DIHA5 zqA^@I4x(ue`qcUuA}#+@*QUea^z%ML{TRY{f5ti0&o{$N_6}>jDQ4e)Z{Flsm+?y6 zR=75K09R*T6@y5sgD0;QkLJev_z%a`1OGTLc3l3_>~)Q)bqGH{7^lk`j+B%3mgKUd zI^e@{(ozj7Jd5wkDXS%e6y`AQrfema!ZnH?++MuxTK3julR{3j>Tb$XPgOo$FOMs3 z(om9lLugkrwmA_92kOCn+b7CaEK6NI; - // RunKernel(kernel); - // } - // else - // { - // const auto kernel = kernel_moe_gemm_2lds; - // RunKernel(kernel); - // } - // } + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 || + BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + const auto kernel = kernel_moe_gemm_2lds; + RunKernel(kernel); + } + else + { + const auto kernel = kernel_moe_gemm_2lds; + RunKernel(kernel); + } + } } #endif #endif