From efbb85be2a9545793dd64cf87661507350b65c3f Mon Sep 17 00:00:00 2001 From: OscarXu Date: Wed, 30 Apr 2025 22:05:54 +0800 Subject: [PATCH] Add .co direct asm support by CK_USE_ASM_MOE_STAGE2_BLOCKSCALE --- .../65_gemm_multiply_multiply/CMakeLists.txt | 13 +- .../hsa/moe_bs_stage2_v1_128x128x128.co | Bin 0 -> 36416 bytes .../hsa/moe_bs_stage2_v1_32x128x256.co | Bin 0 -> 21952 bytes .../hsa/moe_bs_stage2_v3_128x128x128.co | Bin 0 -> 23312 bytes include/ck/ck.hpp | 3 + .../impl/device_moe_gemm_blockscale.hpp | 199 +++++++++++++++--- 6 files changed, 190 insertions(+), 25 deletions(-) create mode 100755 example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v1_128x128x128.co create mode 100755 example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v1_32x128x256.co create mode 100755 example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v3_128x128x128.co diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt index f654550417..6a1a2582f6 100644 --- a/example/65_gemm_multiply_multiply/CMakeLists.txt +++ b/example/65_gemm_multiply_multiply/CMakeLists.txt @@ -19,4 +19,15 @@ endforeach() set(EXAMPLE_COMPILE_OPTIONS) list(APPEND EXAMPLE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) list(APPEND EXAMPLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --schedmodel=0 -mllvm -misched=gcn-iterative-max-occupancy-experimental") -target_compile_options(example_moe_gemm2_xdl_fp8_blockscale PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) \ No newline at end of file +target_compile_options(example_moe_gemm2_xdl_fp8_blockscale PRIVATE ${EXAMPLE_COMPILE_OPTIONS}) + +#hacky fix for bs_moe_stage2 with rocm < 6.4 +add_custom_command( + TARGET example_moe_gemm2_xdl_fp8_blockscale + PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/hsa/ + ${CMAKE_CURRENT_BINARY_DIR}/hsa/ +) + +target_compile_definitions(example_moe_gemm2_xdl_fp8_blockscale PRIVATE MOE_STAGE2_ASM_DIR="${CMAKE_CURRENT_BINARY_DIR}/hsa/") \ No newline at end of file diff --git a/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v1_128x128x128.co b/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v1_128x128x128.co new file mode 100755 index 0000000000000000000000000000000000000000..1e6fea5a85407d2e8d86877f68c926706276d2c4 GIT binary patch literal 36416 zcmeHw3w#tszHUvYXEHOH43i-VA%t-lI$<(~m_)#Uk%Sm9WJWQ$#L4)N%ApF3#qlU(}a&(YmU-u4PSg8@oHRX4Q(-yRN+_vt{}6)eV%eI&f9PuMa$sxwfWh zO+)n+f#&*qYd2K$*v~UptZG@)TD`hq#ma`}w(8}9^$qpaRNyxS>%WSG?Ua_WTMZ>+9wKg4!98F!>(okKyx_WgRj+RrsHoH1ICwILJ)h+e))lDret-r`@ zT@_ec)7CIl!ISl84`i+$YL#V=h{)>JKvR=!sCzP3Z&+E|(sb1Oz0Q?+Mg6~Iu36o% zy1IS?su-xFy478A2+cZlnqU9r5fMA}JpY`zu4Zk+@>MNpUTUX;|DJa&?X0cXu(qN3 z?|947zu(Ee*FF1R|G4*T!daV;wfzUSG#&t1FfANQu6eYoNs?eBTl&OPg0 zJLPoW=eGTN&-b}+56LO64O34;+s_eOQP(^XKml{##EC1Gub(<8N1qDynZ+RIS4@}R z_@jB;8HEWoRqTGuC2#^@Toh(@xvtRX--mI$IHVVh5cvuOFouO3r3X(-KNsoz_!~vW z=aK|2mB(SH<==vIp1(-s`-@y>y{4xx7wHG$1y;)Q_MLW|Af+=a)7MSaNI{e95Xn{kp*FhS@C*mo%(gS=iK4cg5Vgnx=+J%jQ;RpWoKdyt-vobyLlTmNjjY zRv<@iMa#O`HH|H+W|qy(sm{*6xTR^$%4R*KdUEBORunFodw%uInKSW!QDJWN+zYCg zFE5)rx%$$Ql|7}J4nIX6zNNKcRZSb#HQ6~0O$~evMRm%~F0Wa=y3&uu)e8U2 zxrNm^IkVR^%_!vco4Fo^+7>hfR`}aW%jQn4o>JeiHc;1)olDm-x3!@zfQ8mo{ID~# zW|jtWCQrft*%Wee&-OphM*p3kqyIOF^5cT5bLTc(xu&7H4p&$jpri0S*|;d|orUWY zXW-|f#JR=QI5U;jk6KInvS!Y`q#FCab+VsS|sP)ppjp?@*|(I zt*y(Rnq6~dU;QB|1tRkSy-cFMaVB1bq_>oRW*rzHhQ^Zukxwt*H|flBddhTc{QGy5 z56;(fpNTXj{M&bwFSz2&a(W6)Er(Alug@oXo_{6i`t^?q_}2`I(TRF=7Uk}j$W2v?c+;uleKRd<>?vU6=I6p6Yks@QOs$Z zkVa*aO{ULaQ&bYI54= zm={eWlNvRL%^P;4XwmGaz0vG?gr43hn0kzsvFde6j`>@Xrf5AjyC)bbf5XnqBO+}x zyg^ji7Uw9n;m$mFM%?JWiJye41@O z?2KjX8K48$3$z0d0&PIk0%y0?{QiIibuj~@Q5O?13UwK2FoxrBerMFS8=X<+?x)v8 z`=42JvA=gsga6=~fImLY6-+RU3mUPsg8LHOOA^RP1u`!ecIl%>!T;zy>a2V zE^l&j$bg%WYPb92`r|^$TavpBTMS(ZaS7eYhPZAE%33GWT59uQGtdMy0TrMEOjZot z_GEi^f-@l$7v<~@JAsWX>qrf&ov8^{YsiKEH4^>H*)y`s8SdW69E;zM&QyXd z9_Rxm0n>mfK#vkkai#>vjvN~v-xJkkiZ_MqVf)UwuxqCwY}grfYt;2rKFVnb5{|PA z%&`ibWFjnoyKi(9$NnUNheio}HZcm_(sa%EsHjlXwNYJ^H%f^P(!IT*;E^Lrw&GYt zI8nh+(tC7@ld({4nm3%A;k`E3>y0SdHPjE7!FeE};pvUuj_H1H=k$8-IT!kTQHP>JA+yQ5Jx|qw zN=AE-?dlzQVVW=T!VF*hg5Q-3HArv4KBIF@VN61B(hA;_X3PKh_4#NA_+!*yC!p8`oB7BZ; z0^tjUKO_7F;cJBdMEENLLm34@MKB?l5v&L{1UrHQ!P!1CMr~Bkj}^;NTwgrW648FW zM-@9`H%28Po{W%!;A)?|!reG&g{x6rVQw_0r!?kV=} z-o}osEUhCmQ|rh{(ULCM9vF4Oj)47woq@#hDUES4*2Wv7G~`cf@9=rFDD;UjCWUpR zrD-qVc=47?)nAP74s^IZT8GQ6b(mdR)MT?)nPBoOEHa>6T#EfS1b#8@#=tMm*%SB; zufwHkhxL}!i21ev%fMVg*8pS5%g7(K>nBXvW%NgGxjjG}B{`2IplM|>8=fzxRJ zuNh0EIP#a!e#ECx9GDZi?STM09*JzS?>MNiua0ECsVKzRS3XvP6wf+lRU@~(>1Xd_ zphtT1aV2;)`W@}_y+Zq5@w1P`zQ#91nT(@W_2q4^_#?Od&d)v<`>uFd3DVp{*L;mE zpY|K8x(?gaXB+VlM#uCHSkUga)94^Z~&%5w8K=94lbZ&I2Xz&-7^vC zxTmpHsS)Ez)t1-XFz6_5j*NVlqlzUu^L>=KxEwLou3pl78(?Mkbv z*9EJcgKcPIW$^IWwLo?7^|9-Lronf{UIjD{emHg$&^mZx>@`5!;MZe=K>MKLY6dz6 ztu9^{ESO}USfq2F*KHfxLG%sY7r^o9ysm4I4$h;!IQP+h$&{Y4LDqe@tovSB_Xb(_ z!0@_vpgoj9r@a%X4kp{T0!@P+`*xst&}ZKPv<~LjcLHsL)9hV9`(UAc8qhIVV$Z`o z^~9A?jwkLMv_5h7Uh5P09J3mjGsJ>-23Yv+fEn}8z)zxrgojHo|8?9GV4YX;^dpl+ z`jeBQf?sW0;g@}y_ibFKGjiKZKl>sQaVkyiRQ`#a=%C)OKYm5@YY*mdy6#WWM|u9g zi2mupoUZrl-?+%CZjycZe2iX{|E(ETHAwv! z^B=`uzSXLpf9uVD%18ZpcCEg^)mmzW=Qo*1YPqs^4TMfS&l>fmAfqd?Q(OZEdm z^WY)-!$9lcQ}zdew!ug3_XF*N_uKCUItK6Hoc;K5!P!rLF7D@X$=N?h&c3l(q`w8; z@H$EkzX}fPb>W=FF_mM7W7h)JW3R`q2bzw(6MGfVeC)&6O+f3h6S3C-ZO6Wj4Fc`Q z6nhxxIQCTR?3=H~{PES1k5?*+e*M#7lcKaw(Lbi|TIr{@`T}bMN;{5mLC-g(Lgf3y zRMAdXV%?y(KjnRc#srKJr_Ljk{#Ld~KXpE#^p{iVKFWDRj3M(W{Z;g9O2-(IEXR;U zy$xtg<=o#OxxYbje}m-y2Fd*mlKUGZ_cs*UyQvK}d}yO{e0(fQ@3+oV`|YQLF?zq9 zPU$p9QU2otNEiLKknknUX_WqlS7P}6dZS<9Th9x8`4NGyJ{~jd{_*i|?@eO-+uJS1 zzrEYV__sGK#=pH+iScjmDlz`;ZM5&9y6>&9uLoyot(mi1tTl52isQE{R6ot}%FW}W zWnU~IqYB$(&9@tD zB8}!*N~3vIzwgBTMvS{Omo@=uE?o|!xzrD&xwIBYb7?)0=F+t|?^0~je7X!s^J)7_ z;vAWnOX(ame-hGsNl0^K`-`$};6A0%9N7MXtQ$B^X*AER!SVI_(KxXP_lV}U#Xy?d z<^ySNs{+#8Rsp2BtrAFcTN#k%wsIiNZSDUm>vqcg*8c2{gHD_;6Y~sV*0vH+Upaoz zoY#Ix};oAyY0{9Loia>7^Tj1MvwU_E%uCO%1QrZVZ&g2DELt`Y}8&891XDv{o z=JC4Tu1@3iyIsxY@xAI49^b1@;&FqT!{Y`uK4D?lY3uEDTA%K8nxE=)nx5=*s!wz} zmB%`r?9onVE2Fri2!a7C;CSx6A0;XUc!+linlo2LIW4Fr!H3_Um8=;`U|_Vn!7>FL?J-P6<2>FEiEJw2PQ@$?iFa=Xu- zXMSxB6}+T{Hoc&Qg1^*49nWc@t%tPGjy^5a)vJYWcv>r9@BX&x289(8b}6igaEF2i zDBxCw6%%$ScyIy+6;?`!>n$TJfaMJ1L_WLy$nvR*l0tZqqR_Z7owl z(RR9`#1l?L8lH#Tv^`FN;R1;9=VH1(WvZc&W42)i$8kT4J^ptW*h}{b`jEjbVj!6H4jx2-g|9QXHw*;5hZ(PT!As>O7->_EB4#-<1y zlPqv_lE6`k0u$l|j&ur)a{w{^^{dmXsD4crB1kAbuQ4$i*09^7r?+=V?E-0f%I$l3K* zd3xH`4srL=U3Y=IcW9xueOhS!ZQ$@N;P8#$@D1Q_H#pp-Wf2F#)f_@_brK=CI)xBi z%_Rg^rxC7KSRP@U0;2&?a`!^;u0nFRLUMN+rAh8qoWk7-$=!-mxLYB)TLJD~2o6<9 z?p1($(a8~{D6(-c#MtWc$6h@9AG0k zzDO?1$lp&7$^*f*Pf6}QD!A8nzvSK>l6yBv?sZGd!Y--Elk#9gduh`&=<`;?Qs1b;Ine|?g_ zX_CL5zm31HA^t7_e;0zkmEdnV_*)A87K6VyI_&cR$J@~s;3l+6#q2g6qsI*lJl{agl$|fyTRjY*-FVkGa!^H~o5LRKGLx^=q z8X@L%FCpf1A7QZq(;BcyVVQ)IzvIExJjvfY$zKnpN&e=Y!rwf}-@H@!n7t*`CB6STPXQEP4f2>mB#L|`5&k>c8|#)RB7xUoqtTFv3peh zn<|an3He7=8oNj4Kd;i*9hcvaXTQIF?488g9Xfw=So^qP{IyB`!pVcL5Af8=$6u@- zPUhp|uT8EG%yNC8%Jl)`y?)2dn%}Y4?MKG~f2XpZyy>2v+-aVkNmD#MSvj7bj7(3D*XQZc(mXwG4_`Zs zn^A5Jxy!VWR;q=(C0Zz>SPNy%(n6Dpv`}uL7RoEo?8L)7oR=_H!TkVEQgDBOSqknK zFhjxp17byQj|Et7&-=UgqZWaQB)rBXhj*p$I&YpBruGp z0rL9;){tKxa3T5q0p~Cy$4glh#~BRID@dP;`v6SKVYtl$!6YB~3l0P|5Bc;A1Rcp( z&jUS9hTA?6G+P-?j5%LnLEQfl;NXb-Th&-#Z2n#~1{jmSTeSc!`MdBu`o9OF^RH8l zKx6(kH3}G&znOD(gk8-4u{MD*R)H3?z-W^|qbe{;i4Pi3-jE;SK>yCc^Qmu0|DHcw{|+wk264>O^>0}2|7-fU^>5R^ zv;T|w_msM&L;APXVCDLEFZg?&<)QzA{%sESW88FN5p@XY06q(}1D^xhfL7AKp|^AW zn``jUzq$TS`nN^u-cAoOq2u}S|<29o{_`}V2&w>94S z&H6X&=43z5vI{)lDsYmCu>9@k(_sq){?aS(UndBBA?-W#?>SQco-Oq691ljbD!(^e zRqVaCsvI__Dz15x{@uT%)PHEna{seS%KXnQDdD>J^Dhg1`YsNJ2?Wjx?}xU&AKLnUXz%->y=PZ@8nde1jp@~{ zM&F!@#>~r8eE$<;LAvD;lvQyKQ@ds{H+nBEZ|o>5(>h8^wT_Y!tz*`#lUldgtaX&% zlsaL-oV(i-US%+hq-TN~?Yw9iXvjEb>(BRAd6wd*LZe>>FPOV$Q@ z$FB3|-g+)Nr-O9vbj=vcS@ zWZ1e7lR-0s9q89m2mh7S!C#U(_>)ox2Xfu}=cIGH|9TWSh4c#OA6y5={xnKQd>Yx~ zfgalb6k}XNK>keHk9ZF42l@u0zG#np0S(U0m~Kl?d=qOU^w)vUg(i~(n>zWw47_g` z(#o8$ttrt3iMt3&s<0T z-EyI|xnPUe^ZrMf$m@n(zHjqGJm0Ie%8*t@>+Sup>%j*8U)H<-qi46D>XZ18(7Nf~ zl`IwKO@ghyF*vYV+ z)BVML#*z7bUc}n~wseZ)exp^m#skapS4Ui%kuL7*$Aq}AUr@Sxn@GofJTC9+8@e7q zZ9y8hA~kMRYTPENahs*aZIv3g4LSg6Ja*^+LhHVq>xPL}3!NefIv}0Xxkh0827$>d z1*Tjn(A6N&9T={ibkltqEOB)K)xkp7PM~RUnrjEpJecF!4zv#XT&+OcpoeSR_Q7Nq zX)caIC)c_YXM=}se*K*9;Bz6>DQOPc0qJN5u9<H^5!Hq|)Pj58Fkly_t_ZaDd`k!53-Fxcfq`wc(o6WZ(5TmHF3D-gA8?_3qD~5q;N%Ih(Hi zL_gXDebzQ z>)^6p7p%e$l1{4}E#W>3>d`{(!(ci(jr%m1kLGY62kTKE_j#}#Jrqkm5cZ=_#gZ=U zIC_8VqMkK;4EcPe(72hQ_vkd(TB!~v&`(~){abX;Patw)48av{fkLC47 zAEG=fu|Cl60l$vfxL+6_PMs$x{T=EDNI!MHp!Cg_<|2j{&pO7PzJzXb1%?+V@450$+b-xX#VT zz8ax#t2IL3Hq{7y+gu~`ZEKCtw{0~--?rD(+R4|#QFBt~_AL=>O5Z}oLG$^Gw_g@b zI`%=V6$zigT9Mj{wx7m&QQywSnvv4Z#hQ`N@hAIhraspWY1~`42`#&Gi@=V}0>jsG zByD^2DcUy8$J8b?59``C&A*gJ^RBLK(>zOQG`H&7HqED$MsuisuZi=dWz)P#S~h9h zq-B$~O-(&P&g*aaa@_86zpJ7t;20ab)t=>0e zpFk--rCApS_|lPq5DbNKF$54?bF;} z0;IXW6i9RbEFjJO#lUZx`{g)MDCgVxq}$PX?%yoi66d2l=%0nu{g z53JD3H_JS&G7skJq4JG#UlaDdxN+OCw&mBeZCWh9o^6G({Cc+eV)^xK%ZcUJvn@H6 zU(YsAY$2}8zOB|?1axex_`9xUri1gemMH+zT4oxM)-riOTFXoU(pn}LXv12j6G&^B z4xql4xnJJr2XUXNjnR&L4GtV?>-`Pl-rgG!*nGFZ*84;mw4J_gA8Ox5S*8hP4sHw# zYs2Yi3nlQitpKP7l-OxNQ{Y2e9?%>(VVeT9241)20&RhJY&k%CAlW_%=md`U`Rp9c3CaepF)UEqV+in`C8<=di-PIPlA+OYW5dV)ApA*=!=FUA{Yi?y*0?{( zAEZCY`_iA}L+MZQvGgbT6qg45U;&;i7W{eZ)k#!$Zo3DOdwZ;C26pWu3 zM;}pC@^eXIHL91_uSQMd@rA0!;|o=BEi+V6{~7;(`ID&QxRx@CY=O{HsQoK(y`#R$ z@YD4*r;6)rO;^-WUuynQpLqSFK7zJFwtM(t&^s7o$Eh)+K5+Z_JwW?&p9!>KC@V(` zxuVObCsN{8GYFqY6W#;u_TAuZ5Ph=!DRssy-B&Qr45R;%$z#8*lSjvds&JugA{T=3K+9tl<65i+-1me#d=C;9KYYYKPzZx^~;J~`>dgI_>NS69k5&T`2g zFQrNTl!HI#f)C|VgD(evd=!@&d^z}&PI0MmmV-ZDic9{KgFol+KKboFB%EK$QK@m_ zbxfZBMX7O0ACmc}`H)ne^dYfR{)(HV#@Q`+SiW8IPxm3YNcxaWl|Cesr4LEAQRti# zjY8*4Hwv9I)hKk%@8?5uR=pFPO=In8I)63R&exgZT_f;!y5#RP$=@lGzYx++@|=$i z-#TV+{+5<-{%YWFD)`HNNU&zmzN+_YUoP`&pDp%lpH%s^k1GAz2jzb4y;8q6SnS7V z9E1-^{c=ywH}IDGgu<7E_*)o&4@oh6NJ`;DQVt)IO8Age!G~lqd`OlxY9XA8`;gR2 z9}@7D`;dUI+=m2w!k)b^4=Hr+-E2^aFS{`feYR@8U1kdr3Nf zvCbRjLvl472l@Exko?6g%jZ0-;!fi)_aUJIa zhvXBib3TF($p`Quc@I7$SckYde}D8L`O$~uJA6n+oc#X5QiIY5+Tr^LChq@Xg8zrd zNd6yNd+GZEpfY1|`u7hk-2cNI-oEqH?;p&v+JcwF#D+~h(Oo9&E3#UHGh<@H&&y+* z;Q#SUCeB0s0KOTW`u7i_%jx?EJ2p4kc6Qp`J9?vx@c+Pf7rN-Ys;O@ufWB|w1~Q}g z4njec(pNxnq*oE91bbP81vBo8L<|uF_xqSrZtND{Jg9`vM=^Xp%vvY2!{=j`_IXsw zfgWbXcMi(j5%_A5Ww%e()r#jL#TaMksI+O?yds~;s(Qm#w>Ef!hw7G>+l23QRExOjh92fNvAb$9F9j;JX&|+?xbPwnShe za3nAu7!Py;nc};Kd@@EkseJj{iDLwgOcod~zg3VhS`E@WVhqn;shDI##Tni(Tp+G3 zcEJZJ{y=1whueEhWhvf+EG=yx^QO6^KNRQ9`3I~K^r2-qWk*>#>_D0Zf9Jzy`>kqK zX#UO-?03DUGPi%s`L9`%&wtsPbpCIxiSsgi3GoY` zw<9mt+nG1T>zbG2i)a-+j0_T7B;ep zMU4i-tVaG`O$ux}hu`<^IQ$3i_QS`$TMxhE?L7Rpx8v~Z-tgg9z1K}In7?KEByH@x zEMLmJOy3yPkL*9%oc-z=)NjJbc`CjCGxsp;HLwHF`$QFK*v4}``EXXi?nBT1WP_-{ z8f_e&qsLixs~tr}+b{=OdBw2C%}e8pFWac=CpTEQ6m)sd1lVUFbiffXwS z7SsqVYzPmt;p8#A3-yR%oeuN>bAcLg8ZZqw3Frk*0s4ShzzkpxFcV1kgU(yDSJwMh z)CbpyYb&B_1C~iSS2uY1XL3fChxP#Kg3R+0UNj$6D})VqX^ zi#A)2Ep7i|K^1&P2CRh%eC$(F} z;V;CC71wxg19uAWd^yAioya0Lj)@4jZBF#5`U9{#NQMk@z(`R0kT&Vs|%@s3tz&$1Qr~^IG1~D!Gr{?Gjfm3PZKyVRbZtiaPD}46&`_e#tAHU12L8_XYB=q z^{jn{-fw>(XEG#bypl7T*c#|ZSvi>R%y3vl6FhK zv|H9oyX7)r_cDL5yS1mh`?id=XAk3ViX8KjBV)zx%x^zk3kxzP*cg-;Uwk zx8LL4x8LF2w<86946VX$`90o!%dTm~yKk^l7Qs$g3_B%EpP};QuwT~y-QRu7ohsft z&b>(Rm&UhTX}8RsEci>~TyDB}?>HA)F10I-dAZ}nd&jw{Vr$>Bi*QCNRb-&|(%CZ4ziy1x6_e!Qt<|A^q?D?i)DGzQg`G!Sw&m{%JOyV*fNr z`)9PN&rS7KPqTj_y^7Lx`{xa(+CLMe{c}3(pYx^t(+&HlJ@xabaRcQp#LI_R}O`zvT8)X-nnyQ)%Ng+h9MH-(IzvJ#N?(?DW0Y zAzLS0VPHqK!)|GV9n}VmfgP3Xs+K#GxLq|G_EgV|DL%5Jl1(+%8Us73wd)3~H^$&w zkK%i;@*Ej9`MpyWKAvELp}fxUC=j<}s;rLEO|j=kG?n^V|Y zl@s<641 z1Igyv1SFelJrFk6B$gB$Z5SPnvD-t|aN9*GKbJPyam)hYur0qc5fqYwmBY-~z z+JVXEo#VH|UO6JB$`r(!V+5X{caM#S4X5|0-2|V#_&z_i7wyAC^~rSzu4&X_mGfX! zB@jFqRSE2UkQ~`e`zfajs-;6_i7n#ywYubi)st9+;cd>xQ?Y|WMvTb-? zBHziim*N{*&x^G3i-bRA#SEl9C(@cGi?rscNW-%}m8qX7(w1ir_xmk^O+a}>vCaaj zzyhENSO_!&gFq{gY#lb>H9$LX6VL&qy8e{v?&jzs#5GirEewb!%*a4ZKN<=7w$imC z9oMvs@*(b_IIxuv$MNfHI{h6kd<=#ClTr24x6!D*75H7yF%{e!;uqZ$JTx-pT%*5D z&x40L{W!ACk%PB+rtagtxIwh#Quw{tYofe8iqac_7z+?!0@$GcSe%a);> z?&Iy0<>FDUD|*Q=A6oJ!R+c!qzp=U`Iko_C(-I}N5NKXvjm-mEmnin>K--e9xi77K z$=BT1*0JOSe-de6^xzsWYG%oesO6-c%z z6Oe3EW?%-;3d{uBfOMbjKp)TnlpN*v8}~O^_tRdC=Oa4rQt(UiH<9iErBzDq&6oRd zzxjKHz(l=#sl28#T$AKfnqJ;1_r+u1&gj|0+ILa`UC&|ei>%Im-Ij&8Y4#!Ri)^0# zl8ttc8D z_O6%xZaMnhsebetLw$BB)-<3BbOTL556}$c{R!wo+y>*m2F%`ZHzL!Pl`TOte0`hJn_i0J$mA>cGL<`?}^>EG@v^1 zCYu{*ni#fufaZzYZ7D$OM8)m`+9qCQO9I*_el2{YCrY0FQMTikvK{+nJ3fy(p)TOd zGTGnDBld}Dtqd}``2);9AEzzDc8B>h=y!?+J3I8dIFqj5(e*VQ6US;EwBnocU-~!4 z8mz0hcDGRNdH+LCMEP9kbQymTI^7A|mmX49XG3x%^6^BmxcUSb{JKON%=K9`K z3e)|!Ntf&CJLKtky4Tb53n@c^P5lGPYcbjlD3~z7-@cI)AfY@R$~q4FhqaaC9>%zq<5pPRpPutm`;AaMU-!u^{<@F$4fE@c z8+XWBM}FPVUr3kbe%LhqA;0drGyJ-5 z(dx*r`?+6wdiLDJ%W@5856bR=uc~{$&~TnZ{JA}6_3N%B+Z_D5No$GApFQN)O`3}% z|578C)7U@SK-xspDpR0W^{RL$QfKrGJsl0aW{+5`NwNkH)HqoBQvNH36;tcf>NFt^cN2 z1!(W@j-_?L#v)-W+E_sM4RPo_%Lt+O==YfBb@a77veg4~-2%B^H{#r{yAU?MnhlCF zgYar$A1t8lE9tv{z%{}?m`B^A>3eO!vBEx>OWSec8$>k;ic$#sU(H537O-7W3J6z= z?+ezfQDL{Ue%+g-U-x?H*WD)lx?6|) zb#Ab?c=f-5igTAkl7XyFC|-uu+QZZ!ZtTd8`(HXws)3n?+n@A z-ZR>pYa?i1@9FK$`&{d{Xd{1uHu6VkBcDMV`2^a?N6C8yO)v{f%}!aCQ-E_Y!xjPSZs;YLdHd$zA9q ze5`;@a@ttIbrEov>mtCRv4V5AypY>=_%|&vUkvFYe}XRZN9ZD-K^OT1y2wY+MLvKo z@*Z@N!T$!jh~#dr?tqDn|89vA_~l?f}L4?*%=hj1NIw zQ%&;+LaSqSG6=Y)U~-+wbV_Vl6ih+mOE$C zq{)-r%X22x)SO>Csb*^S@+l1sIm_$nvnNlPAvZr4G|DUlfNdKn4r~R_ltE1K72`fFDEnf#y9+g7p6RW0>3Z8a>j ztzmr|&t0=JP>1A(rd2J~tu^)af#ww~vv&1rmWkVcN%_3Yy49it?lAo+YLi6MGMPj%)azu zIjsH7K;vsP0){*O%at2CPNJQs{PcNAPomv4rRnAM`AO8lqKi!|*1 zITC5n%j@%>j^#4HzF#jdli6r&(6ROX`h2P5GMQ1|uh(Bkth;GQA1bfU&pPV!vpfvH z27Q}w0s>u^etdo2(NUGxPkHHA!2dt_-+9=iEc*I^9QF7wPQY_8`VRY@cD%{hNoDC8 z=%>%0FI+3)tpa!((Wc&hSW5HXbeX1?&%H~e=XMBuL0`y-=o!~P`}FdU4KIIMbd?OUa7LH=OPBw`hlfT}O4M|9W332>K&b3Kw z6WbxQgh+~Qf}e35H&CEZ7j}6qP`0I8c1mAyVixzcEbI`c(pT~Gs7%I z0enLIn{EHI%hAk7<*2Va|6HW+yyJ505kdBCvs3fhqjKksa_7G&GMZ=mH zY0qre9G6}Hv#*W1aIO8M^Pu3RrFz@g|zXL(v8J3IR9VxN*zQ zn;AR#*Ug99^xo$99mykI$wR$of6nrrjW-|cInvbH-QU&6o{BdgIoNyV)GG(Mo`^sa`T~%gI$gnJ`yilcshQlx$9tm^TFgd{-P(@b;I`14bRU;H62YJ>Fwz5 z8kgAj;)k1$HMJjV?rXZM`^X(e52#HGdfU}}IQgb_xI5X@(%aPAhuRWNM^`qjN*r66 zs6Cdbu4(FSYil~x-QDxw;yp(?jyCrtXZttTefDg;cXm*!o)>}Mo{mF@jB)yUy!Yir>-pH}U@7WN%a3-RNLPEA_1Pj+Zg0v#q@L`12w(znO2v?`l4pY(LVC z!KG2E|BrdSmR)TSlSh+XpYDRyTP@O=wNGC1%M7L zc@-50+mEfTN|-C4xu95#WmPPqSNNv&+iS%_s)@w42JeX=SZcMg^G4WQcb`Rtd5C95 ze7z{f{?&-DUl6|x#E#eh7b4?!r@-5d{O8>wTrYqPBA(~35)r>O_{>kt_)Q}ISdPFO zQ9t!3wV>T&6wg#<@+6a|pc>AV2#atyQW=e?lGX-yeEGls^t*+PZ$DGNdMxpkrjz&lbTSut>7b8_gUT9Ro8x4Zh!62b zlV|aWD|}JEhzrwrbLw+7OEVXH)^yvBMC%=i>Mch)+V1M;P2Sj@+>$(exZzND>m9pW zn-3+o?bzM4a#dfltGD||)1l_OyZig94kAbGuI{^TZ0_tnvT4WeMAOQZH*_EBKip-; zG*$2E??K_l-K&~5ZQ6wIjSaO;yFb^|-o9gZb7^ zS(!*4N^#KB=O4F+YY`bd_&f>Tlk z^b!|s_W!2p9iIpB+QhEKwlIZVHQy|Oxwc%NRNdU_de!>ovVBs8^Je3#);KR8RoT>} zcl0#1_II>3C99KdEh}4F+gG==u3Wvc`SQEAUIf2w#MT#Fo;*Kv9Rgpne2Wd>2A`b= z>O~H-JXu6HlkR(WI@sH}3xzB6j9hst_vBELrt z+bp(0|Ab|1K#ge<@>!A3?Uz$a+2D-L#aIX^18v2=@kJITlZ(7Q>*M@|TTU(Uvtq2dkBNTra7CChlJ$20DC`_s8r_jpEb(EJ7d zbcxTGDqK{MaZfB7b7w|JnP=ZS*7Zt?`G9MH0bmU<2uuKxF2_Z>924nsM5N1MB^7l? zQ_B}E&$uS6W41h7+LI|6&CSF{Eg8$G^)c&dD(|#ecs(wOaL+rXsvTxrey_`FO_=h=ntx?g0gq@OmrwZ zfPX{LqUfn;AUYiNM@OQ*=qNg7ec75$OP=U(#Hr;h^TwzzoBX0L8@-m)4QGa94QEcp zTxW)2&NBnC+%rWn^-LgUKjVuzEbbVZpOFHE@+psB%Xz@lxjEn4IWRn;4V)U*28K>) z0|P_aK+%9U5Gc|Hd;#sW+ZVB8{41Oi%le%gvft{?vft_UQg`$ow58@$$hM__>|~b7Hyv=s?nT+0oKl(Y3&w_VeE4P` zWBUpF8M}?}C}VpG?_z8h;W5T`5Z=vLBjFbq+e`@Qz9H-WP-pvK%S+hulq8)$li2Ssu763AXq+?SN{YhuQ>vW(@E)E02sbO@*N-Zxdu$@_ninXoPiL=*v^B$` z%-R=?_&1FBAtiNO5oKzgqqIXh`-w=a*)Ph}K5xYT%!t2DN!=@pGKr@stxacdi?qaE zQKsr?BmNyDKBJ`U;DdBvraf0*|Lu#?^tfw1P@aC(wH9cbe%ZAK=$QV#s}`tEf5%k= zbWT6xssg&FpL8XFp6N$j=|BCIZeF_+ux%QT_~VjvH-3|S@AAXG|94GCNnLFRrByzm z$|W2AL1*55?>x4{!pAT6pemdBzv89Y`1cpfA5mq>bFk5pGV7=K$ODMSwT6wv31K5K zijSTV@vxN$#gna2e0WgA!)DMEZZlqr*X|eb^KEDTxU}u0agmlB0m{>V4E6wR)4vIJ z0Ugut1rGt$>30GEzFg6 zNmVC2xVMX2>?y)WA5v4MU5(!^>?(}8iu!oAnXX-}&fXQ*E{M6yw5!pZjkI4GX_(Kb z%-F3){0Bxn<~P%>hHo^|eru#*&Z9CTHyQDNFyb)>nsznB?FxPPHzN)6B9%D>yCPdZ zf%(xKHw)~k4R#?_kGqZl?=hLDPl; zH1?QZDShvpEzQn16o1Q{EzQn56rY*1rP=w1;s>r^OVk&d=UXbQ0U94DXku($H^%05 zV{BeG#^!ZnY+g6U=5=FiUVqZnPGj@>5hwNiQLG7AE7NR@rL!MmuB*rW^{-M#w!U7+ zNwe(w^&E4p^0^Bi4xUSa6lK_Z+GEnY5!23S{_3%gob#gZbg!n_{?LGQZ{$8EX<7Dl z&W8e1EK`g%KKtIj2dx@BK9)3P^uvSm-y zIiZ5`ou0|yq%!yPMX3&&GpX(JM;hhKfnypT1<&;5zl8bVl{bp-k(f;N?2}LexL3l{ zYv3M0cU`XoB47piq$ldwgP%sS|in25}Qyt;K zM&3Tn$rRS^ZORQL32y61RE zFJtUN>X$dMFvo4Il;Z}pOYy7O#M<@YiP|;ciK?3LL}em8Q63LZL}TFzEfSsxg~PZY2pFMH0yik&ygiAnR_&xK) z#T9vyr^~^YA;CBf*aWx)2BcmG+|@uv&^x<}+(U_sr3at_c}cLuiqrwi`72V@1P*4-~#fwsCqnfEJ_ zz{N*D0x{7)Elir;g_NgA^fTG^8XwEzd)Yz|5NbmLl|n)|F2i1 z^YsXSy#GHmY0KJD@4l7I+Ozg6`2XuM@$QHJAA|KAP&-y114{D0f5|Nmva z-kmhG5%S}?|G(a&U0=smWPM@o|J(C@ofA%Xcro;X#qj?ZlmE|_Y9l`Q|GsjpVU8Ia z`Tw4M-2Y!xx+bbBe(wLn?}u^jBpo1!?$ZLP%Mp<-hb@=+|NcDx z75#s=P2u$vON4vgDdPTr_FBF0_5}C;i`s?%ALvq3iz`0R zGYyr|frfZ=r~&?e1N{F6`2P*?{~Nr9|L=|t*E_XrO^r?uHyw2zt^)*jh&~lIDyhJ_ z+SuZCiI{6$Wz4xQ9?M+^9bjD`mb1hGR{Wc8D|1G3N^3ipfV-u0mdiOy*W!MeeAA9qBZKnL5iZjLOuy}fp$Wj}=b8Zg zHc(9N^K*_4j>@%x%`YJiws&$=UguYZ@}r_~z1WDIs{@;|C0lEnGoTEYfu2w?-l z*Y6PVum$ex!xn_Ef4hi>4ul?k_$F*(J=LSZkEZZ^+c0@Itm2&T^~aa=f#b^4Vea$W zri+%4&+nM_E_oEFPOD4E=XXwLxX;Pl!mwu%l{2yHa_^Xchg}+^Q;pmgt$n(YkMp(x zu?}v-yhFCKZI@UFw;6Ut^Aokf=ObK~t#}?lcDr@2CAEcRFKo_Rs!+MD#`?My_GjMH z=C$2|yK~C}BRY*U&HbA(52RA0Q;b8W7>yi5d>6En_DfPA&CWub7=t!3@=y(2fTs?b=!)f;vB)y6H$<(lVdx9+@VU-_dPrw5&vlgR4I6B9 zjUq|m_tSh$W5?%d*evF?Q-6k zs05Zkdx!%=&?0F4@Sy&IJS%)o+}i~=$o*tX_?`N(Byk<2vs36Ex-PK&6@hI(fT&of2EM zVp?Cd{3SiH;srgi{5idJ#p8PE@(w+;BB_U#Z_uf}122mDZh1nIZzbOqI>jv9HD?bcR{9rOX@!h`-Pki&Q!xN8vH9Vnrh9}zE!xQxlVXAM}gQC8M2PB#6 z7JoCUUibWn8vV;5b@>wmYP$Zfv~=5(T3Y{_mVWFZE&a{2TKc=tGJXIp-^TRdjy7C2X~TeKOvMLmtd?=S9oMD$_f8A+z_n&To|bF(B@(YZi} zSjDk}2pYo8Yz4<#S&ZXN7<-D}hOsByz?O4d&Ff#A7WLKMO7;EeLRH;1WfgE$-6kag zOw?^qDu65MDwJ|ytZuoo0$2+DAqEUVf1rCz!V#hS2=n{Gn=}8TSUeo}+i1eK;(0fSN z*#foV%`M$4Bf_qD}r+&-KEN&1K+9k7pn|?CZULMg`v}wTSa(Ivu)( z&?biH`91A_u=D*7!-Jjb*&$zOc(O)<{$cEEAl>6@R7e#%2(ycQ5Tt+9BRb_U_dTR0 zv0)+g!H=_9OV(oO9{zE$=OF~mqXe3V51L09nulBaz*_R{3=3+|y^XyNLhGnssC9&# z+M?2$*o4OmedAfKZv?O&7Q;Vt&^iWPgZ43)I);OtQ0e?!&@q-wv9xc{KFVB{@u=lN zW8Z>@&#?>lEimj?aIkkTIB35DkoGI^y$U`{QrXTlSCV`U?r(XwItsDix~?&a?Bysze+ zR_w~S#R*a8*om0-sHin0H}&QU@8_6g5e-XnRZJ&$-!D|t%V?y;wp`;;;J zefBZMZXaJHM^k=92D2EP!Cnp~DayEoEzj7k40{>E8pHV!_qE=l3hiPm5ZuaI3nc9# z3PifRQ>4rFB3-T&>2e5|NsX#3n98**$+)mj!j8QQN`~=05l%z1P@!4K8UJX6oyA_1 z2Pg3u=w8w=@>B)&?|H}Tw5NLc`a1b$fb@zIZ}c%K7Rd&koPV8T!N}KGNyKYtYjjNy z7<(Mj%mem1;#rgGa&FJW-vBop87lUlI?lOY zx$#U1-Y4gJsKeAO%(+_Pe9t1k;f%s370;8d_H2ka6%>D2UGCdKzcFJ#=l<~p75jR; z1zIexARO})gkp;dN@DJUV9Zre6mu2?V)+HWSOL~EFZN9MfO)`tpaMJagS!5KEaFBfIvqPM_POEykH+8#(O(qK4WVMy^NI;<}tR15KGub zLOcSjB6Q=g9n5i|y&?|qDAI~lpaOIPWuObl`Bag?rve6_@)>*zYlD5UrrYyFx@~b# zx8(q!yrrdmKhx5IpK9sgPqcJs2G4(9 z;e08qQD6V*jc`ctrEpO2r7+Hcp7Vrp4)mNSjB_Bq6m|-}6m|%{bjyqMoaFZxcRo$) zES_J#B>2s)=Op>_)OYYDII6%7b~DFj8f=7bWxF}vM1mK>24@6c;+#Yi5Wfj8AXrBJ^b9~b>1 zZb{+VyG6LN<8q#tI4r`^TSaVkT7w7@{fL@><7zPG_ zAz%@(2p9wgfF(fA!KMrjHf3>2Y2=U_n2!GN5D0cSZ_yTQTQ=W(#4!NHRAI9Qv(!P@3=uvUYEwHh3( z-QZyD1_w(T94u*Yur`B(wHX|&)!<;Q{}(vec7ua$GdS2*gM)1{IM_ymgKaQ4SiQl) zcGlgm{D1lEOq$#Oy3yc0)XD?y(IV_%UQtx&du?^JdfSre*y5n&ES82@INp3pNI25vG27gl3|V7 zqRaNa;#n1*W04Y9f7#yGo7h0WrrG_fbV6^4%qk}*6hGDIwl>a=BneW zqNEi$0$wSnJX2~WAAC{`BOd)$^VO8Boh2gOS|rk~evxkV0>LZ&tUR^YQjy7VmVj5rz$@dUvcWBLZFXKqL?Ybtj=COP zRPu9PnOmQm$~JzFbI0`N&C#;L#@MoLfzE-zX3e!Fplg4=wQ~>FAT4L5yl2aQ)x%}j zONai1BiJ)n7UI7>VS6f55{i13gkr78>&bB`14g*T2oD+IW+Qyc2p=%Q!$!Et2#*-y z{YH4Sq$Ij%NicRh!fw zz?;_tVTU)b0m2S%CcA_k-b{9hcyFVK_ih&P-iDmnF*Y^jJz(9Yp1eDJ`TC8q*mabR zoD1Vz);}dlg!NAkn`5!Q-$+|?d|0NmwP}%tu~^e*q}B9`wA#Bx8pfig*GNko7HL&? z4r9E)V~leqJxInm@50#J@`4x}+6PF_*TPS!*xvHto3IbQcPD{%cVPV|+?n&LX&=%N zv_Wx=%BwQrb|Ve;0fs2jHXCVMjWpPY)MKPI7-<`kW@^k5`al|M;;CIh(vG#K8qt^3 zInraRo>GHpCWo@J%zdu=z~!^d#{I4ViJrGfGm^!P+q?}77+h`M^|d{x2Klu{oe_S1 zsdKcS4A1<)j=|^V<#o)<8#VIMp0iryeN5UJxsLmIPsr&VEJ{8YsYYlVKI5HA{&x6f z*l)*BXOU4S?Nd{qlX&|K?Z=*iT!nL@_%%;q{~GcJ=e6&g*M7OtKH2D6wC~vI(aA?} zkWUEyPWrfymCA+=?&q3CB4PMRue{+&NX&w^VCe8UEFg)F8Ml!iT8WqL8L_v0^34Qn zlQ?GrSC}|w0=ELyS5N>M9(^pCcRucU=M$fIKCoeaKB0N%Od<*BX_-&*;aj}4N(I+n!fK#5_Aw|h|_L8fDeT^PEKa57UvXY-WJ^%1j zigQO5IQJapvqV+etA{#`=lD-t^nvrzo-iMHYX)^vIjU3YJT(L#5_N`IHXi47I;hV0 z{5nfnz6G1SBwi=~`x)BbCWBj3oieyFv;@>q#Iog#*Fk>12lhhO#KQYkh`)JPF}}PG zCyj#zbtvHM7Sy2>OA?jw?tn&seVZx#X$I|YWH3VSjZ7zfL1jNYK3kvkv?Y!Eg7&pp z>MXLFE*hmiDd7IpCk4Dcvt5nmtN6Po*r~(lpEp<3WrIJ66PoLvRLH14X4JQoePxok zw99G3{8;-7=CGyY&H>mv&Ktf2KX3Q~Fd>|`7guC(W->UN72TNartK<(gjBu1B6 zKpPC5K01t!KDh$m>x&N-U;YUh2{1bTIcpX8zdp=%)2|*Q0prI%=go{pcotgey+$G9 zeG{RTweiY$<-v|Vw!NbZuVGpf+P;0G5lioB3VlrWP*m^h>*-x{{q-oT_qW7byANN# ztGO-N-Nj#*)P1<8zb|?Hp+iRxSM(g|?o77!g^qN$R@B5-#VbRJs;cVhPmZA_^!Bp2cuiNlwO5qD z6{dGXFnZTiUvmpkM~&i_z^nef{%7yLpw~ov_U?<%-hJ`$FN2%+PK&GFOY`!5(z`F3 zHr~2p!;RZ+Fz3a8FVehf$AM{%fAD7sm!bLIKM_m9=H(wMOG}wq-dy)gbWj8M(7H!I z!FBk@To3U~fc-9;?KaDs>z;`_jr?Z1S>A}o-egg?nQpF=CLUN&-t51LOOZ}XuUXz) zmrXR+WuqE?4(4w{PzoMrmOqb&8}u5IOD@ literal 0 HcmV?d00001 diff --git a/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v3_128x128x128.co b/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v3_128x128x128.co new file mode 100755 index 0000000000000000000000000000000000000000..128657224d02ee03f42dda36feb6e2fc308dba86 GIT binary patch literal 23312 zcmeHv4Rlo1x%QqjXC{-$%F(@Ti?2$vv~8Iv-i9I-@W&{_xp1CN-9ewgTZ5D;%8)Mm_d6c@MzEWo5Vwvi&!psPkK^_r(1%9DrrlUpUqhLz+b-%+_5u+mt<3CD=DlitsUm2TlDefv2?>df7 zM|It`P>a9yQtTA3tX|O`MwLhD>@_0X9=H@W?u&h?eNFW
1f|3v7ctHZdhB_+}6-jcUwo- zEm{qzOap4$-muzxGG%p#x300fuDb`h<fCRyTIEolN^iWo6#d^kQajx3{~lX&tKA zvXbhy@|M@otrv><<*6Twx6$SNDD$?4HQwf~4sDB9W&H^ z<3V$Ov*NzdsC{|msMTIEYX79oyB^F3Z_hrwtyh?qB$1x^J_Hk#S6wm$3Y!3tzG2H_qp+#}f8T?ps-%oPEN6gTUgnTHP>+TO0Q{QC<-EgOGP z@%zyFc^T|;TC0fk^^xx%c-Mb=8|YmxL*EZM&sE1$pYW`1Syj%;TXOPl=xS-Yt)<($ zpu>BEclGL`wvLszELz#n<}I&URF^%o$J^fB(N)*hu&$%GCwCP%@@qP7ThP$j(N$8l zD5owvdtOId@9K6vq%Ntdh!>oV;21n@yCHf3?4yjsDEc z(f{NUKN?(@zsP%QueW_AT3Fdaxp1CrGzzh$XrCy7mq&q%%IZ)urPgz8q`0h-MK{zT z?rWn(HdN%rOtqq@+)>5x%z5=F4Swj1x|~^jAgL#+TROdME$!Y#D^+h(Z=1J^I)hkp zuV7gsHkJ}(iCSXNF8nLWtGWqmZwdQG!iBZs>Wmi_v2Vym8P?UQEuD2Mds~|7vh#9g zHLqORII}4`x2Yk&S)UsJG5Q3{UX61&?(37xLe#s`k61q@m#*(HA7izcbb5N<%&$*h zfT;J8{?1Y9N6+1_PkFT%-$BN{rEmM-TN(KOQU+p_VLF5L4G$e$VUUJtr7+$Vw}sy0 zOvZzIBJMnBWDgu*@z!BGMUJr!*Aq`@+*UuUb|eIXTf)_r)C~xuG^J*X?}muTKwKz( zYaHT=&1PRoL_}bdB=U~4$m(Gxo{8TYwgkiBFf+*mcUWrWgNJ+rceYvvgI34Dfx9CH z4l<*8co*_!l6lxcXf_XL0enVj*aIAvrQtFli?a@I1^5uZl-?%}9EbAr>EX!vAKe5jiaB{xDGOV>MjSb??uqcxhVoK^J zCOM9nM18Hya$GWvG4yxUx)Qpr>EW97sh`kVJT|d4PLb0B%0w5A$;VBRN%B33wk3BZ zwz;S*pTGJPZVF;-(Y8uS%Y)jk$Whyp4w?LK*v6+a-||yd)@NqE>QfTy8|MQeO*|86 zQ+JRUol zu?(OcmQ`Fqi_?ka?@c=zqXsv~2TOqkM(t zh=9*$_Qf6=6BrMK38@D%HE2sI; zKd~{f+iKX>1B=+b2cp=02Ld_uu0U3!E0Edb3YeQ+Ng{8F{DKBP--y4fA578{7&VvyvG%wBs&l8nB%?)LPt-5b(lyF1gV&Z)_wuC4@uQ?$0Ej5qk2O4M{u zNo#sR$ur=sJ(IOtMmuZIWIF}sxCG8jm3`Ml?P;CcvAZ?@j`mj5t(~p8klkF!ZXPfX zI19KKSOct$EI0M<-tD>u&(}nLD(3_9BTc57rvmEjXt$5rjXN@hg-2>;RGQXnVG7Q1 zg$OI`Si+Q!i7L$tr!jHkZsYikGzU_RaT{I6gpE&letbeolI)Fy@sgeLQ_N!RWNZ-q z`sK-KjIAKdWo#K?9%EQmzB~#2af$IYqu<*do8HuIPxo{y=>^@kbiYMT8(fp%@=0%O zjO>kTwe-feM)cZS&AqnPJ7pzRij-SrtV>d2hTI<)=jxA*b@kiru6~=%6@fft$ms-& z#HPhps?)}Hsnf>vsMDg?sB>d%tr1byR{3^I+8p$Q8U4|3UGEB8HmGbmW8w81RUf0f z&%cjIF@Hvf-+!RV_~ynY<0;U4pfjKkLFYi{LB9w60rUyzGtd{Hzk>c9^d(5{YciTa z7LXNW2gQQoKnb8k(0I^9P%_8?a)Ml-RM0Px$FD$dfsTXT2K@`@H=uVwXFm0_A{mL9;;lpld;MKyyJJP!XsYRND8Hv3R4=Shg{|p}K{g z4TlSF4XEJ_RW0lsR_P6uE$sJ%Z7u8*fN|)c#Qt)k>SvO~>|dTtW$XoduRdgzN%r@w zkbF8%#ZM^g%agMi!?nR*PE_s_JWnmV$W!#B;JKEuXNafxX~DC*<|0q&KM9`e7~4%e zWk0sc5zzgKBv4cJ-V;*!eUp9#l*{)|`X$g@{`#ahftKZZ!Bbf%czBz&J}Npbcuoump5hM#5869k@AuLS z!E-zyc*?Rb9q(q0n{?!%Nh(l2^4O$Sp!vu%liGlmBQH$q0$Pu}IcYu6cI5mdA5b|W zB?p1_Bjb~)-qcpTys8vQBE5#zG2S*e%J=Z~ESBD>U4*(XLGML*BD{FE;KzD5 zk;19|gqPR*CG@p)q(*rGC?8p->;;;SG%C*nEk|0F=YZBDUCJ{++mZFkE}(KGpgab& z9~o4rPFPDz*FqMyV;y#tNOso^#a7q+wkX$))lsbeu@=_2tA#bSx3K0a)xgI2E%xd& zBTvdc!nd9f_|H!XoQl0?oK(|CaZfiIA^*85`-0}WM`izhajvgG|FCL4)(Op{ znSa4PuFv&l_ah8E7;D~V%XbK#@z~>ao*L>W@X%OOKD8t-$j5>GU*}nJkKn=hBooil zdoLaPCiI2mzAw5RD7&wV=>(eHGh=#y7WcH6H9)I-O3YfI&HYjI2B6~pWAsL#-Tk*{ zDsL|I41MfZWJqF;V?Qf|B3`g*x>|U9sXAp{nM(U&!n*hPSk^-ZS-t23eZDVCvGVy& zd*PHJ##|kxe6BC2{sAxbg+4cGAS?Qw>lzSqxR&|@d|dXmxxNH_LUWzx84!F+F^@*C zcLB6TDj6Ts50pz{V+Mfc5^KyL&{87B3<0ere^s^sZ6%*6TY*Z+d1V{WUh<(rWukwb z`rM{I@X_3c-onS1+SK(|wUv0@hE78~yp7;N8z0pBgW53)sIKF0?n(Rjcx;4R=*hC1+=a{8oeH9Tm7%m zKA^JNs04xb)e{sdkGD-*!&l74MmcJ3*I$qIl?h#i%3qcp#m9Gfo(L!1g~DsxBD^+3 zgp)2q;Z*;xs`qx3FD*QwJOGpz9#tL$nisyVJOs2X+^_5eS{J^kJOZ>Wd{)fug`33O zUbq>Wnb4DHZj-LnSZ?IHR^tMr-9~d5`XS+4Qw08V3e*c!`$T}B^te?J>w z{_qCqI}YeOD^{p{$*`nc}CkS-Xui)0Yb#TpsmHb z!!<8Y?PNF(OV7DCDBAuD9V1-5cWOPt`=uAA-o;*Zy#swPK{3fo{WV*{>;ykRu; zjT$-_i}9&?(kOLCK!>I0hcWh$gB*Ik-zdFFx}>r?BJ|vkrqCFY&JuogIMo-!Sm5Q5 zE0!I8YspJSwu|jlDX;3M=gYfZI+!-hUR5c->PI)r!!JGM4l|}wohaYx9Y^KxORuNV zdo}z1@xXINwq^U{?xF3Ex(Bz1+ymPmb_cir(A~d%hdZ$S2W~2t^rCtu9skXXn@q=1 zC&Sq5O*WK$({>{Z?R>;ZWmnuYU$#?y4oEEY>H(aA5q9ThS)s7q;A4`r?={i3iaU?OQj!mvJC|p z?gN&wKHMK*zkCww0p6=w-(h`D##&zu##&wp#+rW`jFn#w#!4>*W7!MA*w($l*LYiI zo2cBP-udz{Z_8|x-TGY6Zh0nXH}47B<=sKM^hD6k9t+xA9|<1dY1}5NSJCRt@+&-z z+oV_@3M!TdgNpfqpdxP%D$=%~!nOvL)-A!8c^ZRE^)B)rm0#d#46?^M81z^Mf*x}) z=#l$_9w`v?FkjHqdOk?)AzhW~R=jS$Oyw2dx|!!u+;Wt!5u~?LeChW@eCc8lU%EiV zlO9X)Wep;}>}C;Pw(zLT?;{?poJ#j3wDy}oE1Cn6Ez)a{}vA`_oT{h&C1N}z<=0g9m z0B1q}u>$j<|H!~=q5qhHbD;k);9TfG63_$vCkyAj-9KVDw;g_~wvuG@z{#KE#OWYh zM&ZCEjMWq3PQ)aJ8DXqead7 zfmhACzd`jdHsU*LFoZU2bcWWhcZPb_I76LX&QM#sGo-dULru-jP<j48E`QGxF8|tq%fG?r^01HITDn1EMTBc5R!rC*M^7xhS(2s`eh>O9UkmWOP?FLpY%#(P4IRv;=WS9HN=N$<-_p+*uV>SFmPYT@ zA2MFw#&?*D-ph_-%>vfHNn7KH&+{5C?(#pFkr{`*FaLv#=>n&v3v{OmoSG^y#U*fx zQ{ZF=5Nq0NtS^gjnDrH2lsm{_l_sARntYaO@>%rF<&&=k^^N?llIB1*=0FzBfkuga zGub`FzKQG>`nN)Mw?KA>AiIN*-2upM5VG5UL3Xo91|hpSgpl1_LdfncLdb4DA!PSj zLdfnMLdfo1!cnqYrpa#EDA_I3WVdXT?3QV=TQ*8|%QV?7gY1&*m1(k9rpaEJCVOR? z?3HPT=pOX(!b$Om+(_IjpN76&G7>^mE%d4!f}L6;rI(Snd2ec0V7@W zE1K-Rpvm4dn(RHM$=*Yn>}}IzZ%~uH^|(9!`^b!fNnG|b3X-_&Pb-M$vhOY!!)1SJ zK{S{Bl!8bu`%^U8pA6amN6790)`zJ8I~wHo71o!1iTqB~Ueme>iFRT)v=>ob^M@2 zb=+U5W>Np1UB^PpS2#mUmpMbVOPrzV8fU1o${8xFaE6LYogq&VUwg9~I<0^4B)G{7V~M{^j+qWRk<>q_+T3X68?!^mv~t(KS8fb|0I>@yFy=iT9^j z630hzr%&%YSUksj*;tNu!&isG*I^uhrCkieJmSl5B_Ev;UvVRB0>G?g4A)pAzM>k2 zVIT3ORX_&>dWy(BbHwMk7H&p=ANfwfX?YTGQo%`i0&qgXujM3QQo)Ek4mhshm>ds` zFZhK#7C5%xh&%>3rr;3H|M?$$CmCxfaMDD96UGZnN)$LQL128Gz_GCc$Jl|8*_ToRr{l}ReLB;wLhG#+JBg#+IP5B`wvo7`~8zuzQ>%M%|dhLIz#!_Izzd$oT027 zXJ~q+GnAI@47pOBA%~O8@8rU2tKU)O^1CWs{7R|AR+=KDn06%5Io5}wRxR88@f#uA|v5*-!&c-+(JT-@5m`8kx z>F_`Ux}4;vGUBt3$N2_`6BUMi#AmTG6b!kQm=EJ00~w4dcvy}GMi=amqkvHb_sfyM z$bx%u=koWFh=O}$6VOy}w`>F&3-07H8>0vrjJ64kvI>l}2#hcbG|2*uQk>6#^o9aI zPj6rdXMWr{{uh5=Y5z{(_U{DPzu&Wv{d@aY*uUdp|CStN;|@z#+P@QEA1B-R<@WD{ z|AhVfpyeN6|33P4_V1q5ekl|7Z!7HIYhnLBpK_4dQg^`yUUZ55``l#R{;k`t`+KTf z<9auyhtJ=^?ce4xiLD8WjoZJmzaKA%jEDW(3j6m8SIDMZw0}ogE6Ij#h7Fz8j0^T} ztFV7#R&o0`?BFIJ)}w#4{oD0lw0{?suerkhoeCMeF7ju>{%zIl-B#WHZ58(Lud#op zVeKRPw;M?I?{py9zo!95+rJ|;`?u-;mi-%g=q2{=JHDCyo9y0X|26^1{>^RaKnd^v z2>W;1jXB)@?Y=QrO}}xLI_<{k!v38j3H$dTknG<^@9Ae{@WB54p1|xg0yk@!h*aFE(_I735)TmKY_N@R24h>u{$12-PcQ6M(mlPl^y_=g!v4KB&gGNx`z*J? z_I(>{-?u5P#@opLZ60O+cFPxR--^QR-$ta@?cb?md(_l1Yt+=3wQ6egdeszRZ8f3~ zy4M1uX$B2dw}#fto>WpmtCvs0*|Kv=Q_m=poQf&?BH-pvOQ@fOdoSfSv(8 z2YMc~7xV(?CD6-#MaIJ$!-vsF4#sp_R1VIU+C4YF`_F}~g5ZZ28z96|!-gj~QcL16qzn#T*7&kByD_InZ`&Ld+XL<=B*% zH-YwJ(_*Mh^z-{Ed}!Sh~);OU3WY;;>6!}v*go0A^| z%6UIbegJ6Bdp3DH(2}=5c^lB0cQkn`(3baZ@)n?y_fazUZOHo@w{=rnb^BY6b`I5T z-MM7z^IWv;4yFkE){t9-V=d$MTxts+Gin3)v9@vhEY+Wn(Pi}}yQ`EpGluMRa^7{^ z9&65<$L+J0y!qT-Yt35}L-ssd-gmhjSIJu$Lv}uU-Yqdy2G-3+vWNY4zbWed{drLz z>^DSxwEwh~Eq$eh)&CT>ZmET}zjwjTefkz**FM!M(BC4kLnS-+uQrmMdsVaX!n|~% zFGky=UDOwt-ydUt*5~dL^at@^T)FLgDfJ8H_s7_~b)G63XYgQLd0$miA7OrfjD20_ zDLo>1Fc!Su%6@U_IFo%)GIewNplrIG+Xv02P0AApx0voz_5iJ>$GMMy&GaJo9Z*cK zE96sPH;s*^`Ta5UhjX_Ms5BSn^+_gu%$>N8AU_r#?!}8&Fjm|*D0sB_>cZGvutRHp z8cX2Ap!&YJo8x)f9M99{c%C-L^RzjhhdGWi(bpAv znbZeln?9@AxOtm9xeaoKUVmyc@qDP;rm2nK!Pqa;=QOpEc+O~caNa)fpnWxZf0try zAfER$J9ycSOWXDsjkRI3vKuHHR&sw2v*EkK55%yD`-fN!^OctoZ!-uV9mQ}R_u;V{ z;KP9Oc-yo!#Q8IEHrMC0>k#VA_?&i9-!A!yX0zTW!bv}&@Tw<8c=gjFob(k6r~30T zxvbv5qEqG=0<@ieGx|3`<@9UO?*Q$m zUx?OqA)3>qS2h1y=vB=l61Q=m{;jZWpL$=Q{{w*?A3;6QY~8C)kFswsSt9J%wY36k zY6MnSU$ARO!mb_eH}ienhub331B)>?bi1~;zo&71^&IqYJ0EYE&Urna4X2pg2ctA} z+24yej5CjG(&I4Cjh~1(j0umc)#9$OnPZL`KL@Xa%UvlUWQ(VHNzV^qdKvwr!t}gl zJuHjoQKZ?=c^-#H#p!uGF)B>YBTfsWzL2m6_&kQa=-|D*H)4&zzAHfwkosh`M}Nm2 z&v}hs$lVl%bwuKOIMN$_BX{Xx*sm|#sd>2k&82l5^2YBz|3u|V=ud<})_B<4qr$M> z^RVrDST%3sEN%Veajrrw55%F3JnZZ!ezcK?73*R3JdZAIt>SUHdLD?=^N>e{>3O*H zJUV$Ef7R9kk-yH5I6aTJQDJ%>xmq3u$S=()(kTf^mn7S74s0HTxr-~f4LRo?x&y}h zy@PTi#osAOgt>PQn%1J9Yw&#T4oMUdN&A&GO+< zag8F*M2I-kpG4ds8M;hDy~b!nFtZb+eA9Hgm==dn?{u%i?Cq4XwyK_e7S#AoRi0iHVq_gmY*0EM{Nnl zZ{}@6oat>57u4JINwjFwXJh7@&W?(sHhmH;+Vt6&D3dJe&C^kvK8Y4>`fSWmIc`)O zwFPm!O^7pv&qs5An1plj?%(d;7ZsMqPpxMVZt&4rfsoD!{pTjZFGg|=;r=R48HpI} z7n5+#puWF91A3DidQ(c&ecb;g$sCFsHzh13Pvw3uwz0SlfbU2m`Mua%wu`pBr?>O- zpNV#UacI72S7MBNnAuu}6OC!Lqxh(g13wde+<$1k9M=3}!ih<4s`DWEg~Y)xCiQ*y zQ0i&-VCuW>E%1N2XHtQAAoZj>n7ZHn3F`Kzu_@;M)ERE>_o5hCAa$CX`^BWf-^D(5 zO2`&7h5Nr?29f^@_jy6Rhq(WS+dRnqHr(dG#E&lex}gXPc3Y_HEI9T83EPgJgRcWPP9MYu?p-S+?>zPBw4h zb)0Oznh#5{=-UAr1CpoDAxDH?#7CJ#Ta&c5CYi71*D}D%G{`|-rs1moEb^%Fbn@}s zEW1S5IjRTA?&l+rSrPUjg*6i2k~~G$scbb&Vv2GWSj*9KnKo(+@}TSEAG>R^A-1L!TE~& zwBUTjeOhq7;yx`nUvZxnoUgb~3(i;M)6$4@4*9e+;haN0EzLORkWY(>bB+vb#W{z3 zTH0{VA)l6ZoO2{#C(b!!&;R3(7;DCR&Aw-2RABAC$733SOZMFt(*#_)Z*$BN;Ie(2 zVwM7z@4G#w2DoD1>q;%Ke&34<_lfzPkxj>YQO4s+Ca^f;J|zuUnz0F&PJ2(3W!$c~ zfE5|tN-D52W2NE%R%MJ;oWSagC}kor$K7p92IjhNwU(GssubQ?09yM#m-D=jfJJqbIezltHH}J(_ z&_O`xIvneNK|jiM3@8+4g|>4uQF3+rxVn`P+pae8?Hv{(v*I zb(=FZw8a@37<7jEgU*mI;QZ{-T~@zuugl*LpPm8u^bEnLXDfVqw!^3ALHP9Se9XoD zd3I|4JP&IAJli#Yo~@cc&yeQNGobnN^lSb+J_)ug-$4(x3qCki9J37zIJym$948y* zb4)T+a2#VO=NM@y<7hC@+2aP-+o>+VK1nJe?527EZJ=t@89Q67859VGc zaL#Oj*IpwqKTqJSnF4dO1?FS{v9@m|pB~LGV#{TIJ)ch3WG_vVJ(nhXjw@tux+Z&R zn(T4Evrmxxs(wB1!zb-DeA3>9PufZNr0s`K+6?%lO@mL`)CQHy%h_-JddR2jTfZJ5 zx0m~*K}Uf!3cnucDVO;5d^%B+U(K)QQ_ZjEQ~MS2%l&#FuiUQ($o+bNSNH3AAAUWj z;n(vn{CZBpuV+8}dS<|{XBzx^rheO0J}ndZQA;4(biw?*8};zx?j)x@as7Zzn*vD*K-nnJ^SI;GXs7-)8N-L^;^H5 zZ~c1WxNh-vem!Q$7yoGwQQ9k(TmCa?=z@b2{j37mpCs|{=P$z@^t4Ay5Ao)|vllYf z0(4kO-^p`mUkGz#=VxYRX02-JVU;cI_^jWxj>^hnEtJ0d=lB|SN1EEx)7gFPj2TF) z_BLj&>{vacrlHB((at}=*Ri^@x5qo9t!>TfjLxo(R`1FlM_0$nj9Hm8GqW5yxw(0H zj^>=)hK8Aqxec?kn`e2wIn67Zvh!x;%x-MRYRsM8+>n=>*OcEhtI@4j{i~4o1H8y~ zY2SO)d7$|JKlObxmf7CX<7JunfFC~n$1+#7_hzaM-73@mQg}}n%k1iCYUpWTnLXaM zJ)E09_lIC_TUSS2XG2p{OZzI8+1TC9GR0>ZGgo$t6c_>e3>c$N3-&ZL0#)QFo&;X~ z(^21k=ZroH_U(7h{%3vXtgiUxs=@{3^YnG$zl*f4&{C`|zgK)$RsW8vUXH#V>+h77 z)_6TV)7NPY_h^i??o(M%Tli1k4`T54c{tpmr`Okg9bseOKRsSguLYC8H~Gk3C|}-&<9`o5%P!`q-g6HWzDyY%!PZ9~(spO{AdJfoiJaR!Le z>)m$Y`@q`9xvP}8M*WC+)DQOoJT0|>e)_(le~-6Q1oJ+kN4@{-TKHToOiyp`5)u0M zgFk`wa`f-A;u-F1c%B#12L!jC{&N1y-*IJiMW0d*dU}!D#lM%QZ$K1F zTo$gU*Rd7x`q+(TeS<27-!T76KGT9v?>}9Bsk`*HYpfgw7KyjZ$6qg3=iPvZuS##L N6&!!Rta6v9|38ryAJzZ> literal 0 HcmV?d00001 diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 0c2dc799ab..fae4178941 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -169,6 +169,9 @@ // operations #define CK_USE_PK4_LAYOUT_SHUFFLE 1 +// using .co compiled shader for moe_stage2_blockscale +#define CK_USE_ASM_MOE_STAGE2_BLOCKSCALE 1 + // block synchronization only s_wait lgkmcnt(0), not vmcnt(0) #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp index f707f1600b..a8466a311b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp @@ -5,6 +5,7 @@ #include #include +#include #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" @@ -180,6 +181,94 @@ struct DeviceMoeGemmBlockScale const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split); + #if CK_USE_ASM_MOE_STAGE2_BLOCKSCALE + const auto RunKernel = [&](const auto& hsa, const auto& kernel_name) { + // printf("Loading hip kernel\n"); + #ifndef MOE_STAGE2_ASM_DIR + printf("Failed to get moe_asm_dir.\n"); + return; + #endif + hipModule_t module; + hipFunction_t kernel_func; + auto status = hipModuleLoad(&module, (std::string(MOE_STAGE2_ASM_DIR) + hsa).c_str()); + if(status != hipSuccess) + { + printf("Failed to load module (%s): %s.\n", hsa.c_str(), hipGetErrorString(status)); + return; + } + status = hipModuleGetFunction(&kernel_func, module, kernel_name.c_str()); + if(hipSuccess != status) + { + printf("Failed to get function (%s): %s.\n", kernel_name.c_str(), hipGetErrorString(status)); + return; + } + auto arg_size = sizeof(arg); + auto arg_ptr = arg; + // // RunKernel(impl_ptr); + void* config[] = {reinterpret_cast(0x1), + reinterpret_cast(&arg_ptr), + reinterpret_cast(0x2), + &arg_size, + reinterpret_cast(0x3)}; + + if(stream_config.time_kernel_) + { + // time kernel + hipEvent_t start, stop; + hip_check_error(hipEventCreate(&start)); + hip_check_error(hipEventCreate(&stop)); + + hip_check_error(hipDeviceSynchronize()); + hip_check_error(hipEventRecord(start, stream_config.stream_id_)); + + status = hipModuleLaunchKernel(kernel_func, + gdx, + gdy, + 1, + 256, + 1, + 1, + 0, + stream_config.stream_id_, + nullptr, + reinterpret_cast(&config)); + if(hipSuccess != status) + { + printf("Failed to Luach Kernel: %s\n", hipGetErrorString(status)); + return; + } + hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); + hip_check_error(hipEventSynchronize(stop)); + + float total_time = 0; + + hip_check_error(hipEventElapsedTime(&total_time, start, stop)); + + hip_check_error(hipEventDestroy(start)); + hip_check_error(hipEventDestroy(stop)); + ave_time = total_time; + } + else{ + status = hipModuleLaunchKernel(kernel_func, + gdx, + gdy, + 1, + 256, + 1, + 1, + 0, + stream_config.stream_id_, + nullptr, + reinterpret_cast(&config)); + if(hipSuccess != status) + { + printf("Failed to Luach Kernel: %s\n", hipGetErrorString(status)); + return; + } + } + + }; + #else const auto RunKernel = [&](const auto& kernel) { if(stream_config.flush_cache) { @@ -243,6 +332,7 @@ struct DeviceMoeGemmBlockScale stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg); } }; + #endif constexpr auto estimated_reg_a = MPerBlock * KPerBlock * sizeof(ADataType) / BlockSize / 4 * (1 + GridwiseGemm::NWave); @@ -257,6 +347,66 @@ struct DeviceMoeGemmBlockScale constexpr auto MemoryDataOp = IsInputGemm ? InMemoryDataOperationEnum::Set : InMemoryDataOperationEnum::AtomicAdd; + + #if CK_USE_ASM_MOE_STAGE2_BLOCKSCALE + (void)minimum_occupancy; + (void)MemoryDataOp; + //get .co file name for ASM. select by version and shape. + std::string hsa_name = ""; + if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1) + { + if constexpr(MPerBlock == 32) + { + hsa_name = std::string("moe_bs_stage2_v1_32x128x256"); + } + else if constexpr(MPerBlock == 128){ + hsa_name = std::string("moe_bs_stage2_v1_128x128x128"); + } + else{ + printf("Faild: only support 32x128x256 or 128x128x1288.\n"); + } + } + else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + { + if constexpr(MPerBlock == 128) + { + hsa_name = std::string("moe_bs_stage2_v3_128x128x128"); + } + // else if constexpr(MPerBlock == 32){ + // hsa_name = std::string("moe_bs_stage2_v3_32x128x256"); + // } + else{ + printf("Faild: v3 only support 128x128x1288.\n"); + } + } + else{ + printf("Faild: only support v1 or v3.\n"); + } + //launch kernel + if(has_main_k_block_loop) + { + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + RunKernel(hsa_name+".co", hsa_name+"_odd_loop"); + } + else + { + RunKernel(hsa_name+".co", hsa_name+"_even_loop"); + } + } + else + { + // Tail number always 1 + if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + { + RunKernel(hsa_name+".co", hsa_name+"_odd_noloop"); + } + else + { + RunKernel(hsa_name+".co", hsa_name+"_even_noloop"); + } + } + #else if(has_main_k_block_loop) { // Tail number always full @@ -341,31 +491,32 @@ struct DeviceMoeGemmBlockScale RunKernel(kernel); } } - else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 || - BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) - { - if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) - { - const auto kernel = kernel_moe_gemm_2lds; - RunKernel(kernel); - } - else - { - const auto kernel = kernel_moe_gemm_2lds; - RunKernel(kernel); - } - } + // else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 || + // BlkGemmPipelineVer == BlockGemmPipelineVersion::v3) + // { + // if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd) + // { + // const auto kernel = kernel_moe_gemm_2lds; + // RunKernel(kernel); + // } + // else + // { + // const auto kernel = kernel_moe_gemm_2lds; + // RunKernel(kernel); + // } + // } } +#endif #endif return ave_time;