From c989bbe3aa87a53f2f9b3859e59da7915ff91e7e Mon Sep 17 00:00:00 2001 From: OscarXu Date: Wed, 7 May 2025 14:24:30 +0800 Subject: [PATCH] Update v1_128x128x128 to 2x2 instead of 4x1 --- .../hsa/moe_bs_stage2_v1_128x128x128.co | Bin 36416 -> 36944 bytes .../moe_gemm2_xdl_fp8_blockscale.cpp | 34 ++++++++++++------ .../impl/device_moe_gemm_blockscale.hpp | 32 ++++------------- 3 files changed, 29 insertions(+), 37 deletions(-) diff --git a/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v1_128x128x128.co b/example/65_gemm_multiply_multiply/hsa/moe_bs_stage2_v1_128x128x128.co index 1e6fea5a85407d2e8d86877f68c926706276d2c4..8d740f3e94f711220ed5d8fc71ace364dc9b8221 100755 GIT binary patch literal 36944 zcmeIb3w%`7y*9jNXU}A2XEK>g666x%WEjFELQF!434#eRU=l+N31C1VBqU+a7(z(G zRW36_h=_ogLhC(vEr=J?+EZ%_0gY8_{RG?dinSiK+Ins4-TIy5e9wQawUdB?^sV3b zp6}H8v3X{#z4qGc^6a(N{y+aU*)+XkhGH@mMKSS-Vy`ii{7z6LzgKM+Un-kvRP|{6c1CpjD=$Xu9rnAJrw!7>Jwk4_${MFqXiBjEy-N+uB9gz@gVs zS=+YA^LL+uBFJ`W^2?C>(RHuM>t1_QoHtOW9&9stoq`i*bH6s3fzdA2Z*?|OyPRz~ zjlP*#*}=!fn_M?Mw!8}JrcFawP~i$>b5IOhgVn^|&6)f~n)KTyA{QSYeFyt1ji zwW+0J&#Nr{KxX~Yw)VOu%U8B`vb!_u+n09idMtBk`|_1-bsbGhmo>F^);0UrG&RX_vpggM9c})W7FpjPXLhVz*08+g=cma$^UB6Y zGFNsqb<{PkMIro4>N+~>mt1-DK<27}eAz=HFf`u>GgsHIYHDs@j>e;QDEj7xzJBwM zGnY3vcQjGUQ@ZC4BIo*+l}&Y5`db_KHLR`Uq0{%rZ;DRWBi3*3k>?Kn^Ic++U2^=j5-EN8R$q#=4f}%iGSnhZdiqhfes;>RxBlVb6W}uiIm3 zXKcl$RZXqmVV9+!-ObLb&;IjYx6j6%u^H(pYhC``J!ctbdAGCbweY`gubp|k;!oOl z*=^^YvD=2+?yUOlp0nw<`{iPy6RUxy&c_g1x}%SmZf#xNF2d3Q3N+DcC%EgRbjGu`*4!&vQqgehSr0h zmi+$Js!LvfzpCNa^LJVO(Iw?eZeQrR?a@!lCVt>vRB*jEKB2(cJwN&R1?vm_@fZAG zvABi~zMhwP*nw1rp=;R~gl#yCk1T-^H<-m-F8s}WN-Y4*xCTwGo=v2ObG>G;2-IKQsug1YAB z%9=@aW#!F?Et!#>S4Y|B$?TW6HMQ4wV&R*e)6~+$SJ+gh?Ch%gj*i(nmflPC={3c5 zIXN>|woEJL<(s|+7j<6V5rt(&F-yyf-nbQsUffHzqx|}@Tk<=1Z{UJe>pwluXgqvPd__yeJcwGmQ`oYZy?#X zW;i(KpE<|Dbl;lgkQw64c@9ZPnby_m{7!aiUG3aSVW&Cj@UrMFhojgXhggy(UX3hT-(pJBJ7!mi zDQ-`=duK$krmaUDT{F^R366@64m*3}cR9mbcMi8&gClIVaF-(Tc3M^a2`0W%dM#q< zjzl8NtZv<4nXMi^671d*ux#tr+}(#NqqiPr(U!iw$UA7W^tlNwmcA@Pl(h)hXI1+6 zJ5!?8w+ms0bhU)Z+YV_FE$DqG5{Ze#gmI6^oY{_^APXMqRV|*Y+4i(9MN7Nd>hyLo zm)FUjImTkm%3)?t+shnjVOE|xnmv7t#h8^|Ryi)hu0BL>Izg6taEq*q!mP&?b3NlV`E0dHGACOur{#7IRbmjW7DG9wTGhFbvSwtp^V21ZK-N>Y?a?~S)4CYJ=GUP zF?UqO&b_*7vp4J*=B4(qs3|?8j+jIHViPM^aKS4I+hk$Es#ju|)x|;_BLp5y2 zp_DE0JIxkT&(n#Pv@}yF)E>9}aKxURM(ufneYhv$7%9Fx5PR1Fw&V7O`pMjx=MMQEQW(=^3mvTRA|HSHQ5>d9C^ z_w&lUymn6;g*_ILw@3B3Qq$sD$Eojv6PU|i<%SXOFEJi@_D@>(>PNo(c#q7l9GD7SL2$iO}o;Z z4n5*@=}cuT;!M;TUR;x0u8`N1)MG*2EM>%&Eyoq0S26WCypA51riGHCTs>j?;;~`1 zVQdPXjRZU!uHJ-Qu5iyz=3MZaHB$)^M!SHCz%<}+V2+5lO$H_->=EZ}c_LoRP=d*> zN?6dd6k%BbL8$OjJ}TYID>sI+v_O2dR%6iXM7VnWQ)wJ9MHH>+O)1zRlWrUqsP1 z60e!bbugmr+!*{S5;-unTfcAWZvFnLTlEK~cIhhJ=dP$A;X{1_lkbTRh9~Fyx+drO zx+kl?(K{w*L>-L^g{)RzqNULve{I6fpyJyUWY-@WzGHpHIXiMQhV9svL3O%V)~P&2 z)TN|aVDWrYFgdg}V^pX+LkV?djQ&9|kh~*5W7Lkk3}uI!vBhdlW7dR!+Z~m$X?Ii! zju|+La1`U1g5x3_`8X!y$ig0;c(;d;7G=if@3(2kvI}@48!5X z;lg3Z;lN?Vq2YLXWPEjOTUvIY_??S5s0 zMGt~Mii4h$$T8*OFVqJw?DRiau*(171#A4jM1CWYUzc_LSe_0>oFhmV<-e3EezP7~ zzsXNL%AQx`y(Iynx~<2O~v9| z8p=-NNz+pz>_Z(W+}I+*Ij9GXGmW|k=b}CoZaON$9@L4(ou+aT_M&bSZfsP7MjcJ4 z<7+6Z(tkYmRiN7ca_lQWOaB|OF9WUp@5a6a)cQY){db_f|I66l0v-K|<2caSuQ`q^ z-JsJvNPS6l%KW9GT!k@*#+r*T)=*s55l!8H`Ghmo;UHjPrhGMFWN9cr?kY0 zVJRZ*l!r8R7e``WsSy2z2((PZ`<P>wNT~wa`#mX@Ku7<`l#xJZzbhpf^Ta)6QO|6 z#{Z9wLy#ypU@^c}x^9y^Wa(WO*uqVxAOL>=$7V=G+acNYnTz z;=p$g=3PqD{Fn%5VBR&_jQGDBJXQ*-9d`iLf^CkyKubZ`aU0NDu+DJ{P%CJ6+yt~2 z1ROU29R&@J9|D~P^BqI`y%_b|DEaxx^MZ3FsAG_Lk2#OZ@f7O!$%d&qr6b;_JS);Y zje4eB^(&pyevWyO(jG-UjdV*ecv9SxsOu8HNcS1$O^Pc=T?f)(VNY=ts4wO9DdyCn z_3c5qm4b4|E}&Wvj@=2g6s(Kg4zw0D#BKv>1@mLK0__FGvE4vN!Q|L3ptB%gr@Y%X zh_SXkBzV~Jp}^Kp1?ukz416H45n~Zu*L?lSd~HOYO4$=}%|Ny6@i-l5DSIR?0JN4J zh-(3AWp~E40_|lt$F%_+Wj}~(2Rh5HjXSCBQ6~A!4a_BU&k~eTK63+e3dKK#GRtRf zU~ZxKr)3?E$~rtL>rgK1P;p8fI#Cv-Y;)WypjviS+!~;ztRrq6&{}q7+$ToC9etBDH(on@ECtp^Wiebx|)avpxitlQ7vti1jVpXgDrd=qGU~GXDyJwetkdy_{owlzRC5ueKaEo=+OP zsSRknzDhoiG)_|-jmzug^GD+_#nITiT0U#>h=ii1CHya$cruG`PHp^#@_rHRbB2qaqE>;k=ySAf21Mp`7EC5YikG zX=om&G&Fw)gLv63uTR^|D!iLnrQ63$$M;RU)e>PZ9-^mu_7N7oyf3)#Es#V&&9Rj_-=Yr1n-+deV`XY1-Xtm^FA^?8z7AE>Ku# zf#U0{V0zzzMZU;_1-@xmpRwM=U1{vP7;4=OVk0`ig;lyZd~-Oz~Lg+C>U0 zl4W6+*9PmF0$_Iwy4{?Ey}iT z3)|Xl33PWCsa@SgO1Qg-1-pv^?{;TmoS^kt7Le9w89-W}jR(^DEDcEOGar!FXC5G} z&%8idpV@&ftYIC%M66-0z+qU!YQW)G!>YiMSi@R?$ymcOU<%f-3eb%;EVmh~TE%Sy zkZw2K(o^>Yb@$Edb@z>HboY-sb@!ekeFA<3K9D>*T)o34Rm^rPwbPnEw&^8XOCDzKc!; zX1&3%&^;C`ewo3o;#kmkoZKLe1&f}gSD#}+_fd8(-Ve_$ywyA!IJ)pg^C;k`!ae2@ zz!8Plo6iBBQ@F#N1WYQt#+(35C>(2!2gVnsnwc~I^>b4aVe80$eRQ(GQ6mM87%uRf zVFHs91tz!z#yf#x3~Pdx%&$E?)dwiUYji5A^`q)5Mn+V0I~ zz&XMr>>GLA*6!(TX&vBq$#r*V(|7Czzi;z~T5j=%+Q8?|UEujn@O(RXz70Iz>dhja zg3mdG;Bzh^_?$-wKIapH&yxw;6gGtrWxj||^7{gCbC%@yEXnUYij(}FHH6<)lHXNB z_+2IWT?Kw$0G`g0{GJ7V=TTVldzR#PmE?Dou2EOLwJO7ez*Qy^7~=I@0NQdzweOzzDe@CNAi1{2!784zh{HrRp9qb@VgBBF5&#n=DfZJ&jay$E$dyd$kSUp z-_tvLuBW%E+C%G>-qLbUZ^?`Se&;P&r(ICL2K-*-4V83yL#6HBP(_Tem^Gp z{gCAMe#!6KCBJW!{JviD`x?pbwSwO>+9kgOg5M<#lHc(TFE{06_B3~P8A!@+U)NgRhgpK~0J z9!0bc!CN06!?6+^OvlG?yIhZ0<$6Sw>k-DsaF17T+?~cT9%Hx*+;##JID*@03U<-} zy$XxJd5Ru?W4<2$qg*|HPnPbydA#nt(Wg6qK?P=g4alC-_PPxd_n?Kpp zo15q9&C2ogj?eV;`Z7Gd-ZW3I+r!r=?n|n*)ag}P$X)3Tc`LjjU%59lzSJAan&A!Q zmUu(?#oo}ABCmtEI)&EZz&*>u+ z#ZEX{tWjL_Jzi0)gh^tJ5|8gbo1&k*DtlsIZ5Wd->d0{skrgur_8B?Qi6(Hv(nGsiL(#c>)l zaV$WeAYEDxNjt}a!!z(s0Q7pu_vcv9Ig)%7j|Dw025KA&S~Z3nV{M|aAUG0-bw*s_ zt!gYVw(v$Z1{hPgN3{WMh1cVq_P>rr7w%BaKy%?WY7{W4a0};koI|WLV(kKBG=Vm& zz-Ws=vnnu3Ner5By{Rz7uQxHov$NEszeWGO)cSu;{~d2VgZ`^1eMM;JBK36oZ}%Zf z#1i!WIuePFM1O<+yAk?t=Na_h($tFw^xtymzt%aG$ zmHsTlBph;97=LHbW=(S*MCP{J!+T9k*G(Y3Dd(LaX<&gAdy1)@2HfXuj{{~g#K%>gi^Tv z8-V`XGB!f-wE6saRmbtriES@7CLn!sWUVbyDoUI1Gl@TYlE!6;?x z#&}nJD1LK1*MCcf>c7i`{#%rTk*2fD7w&BKZSGtL{dYare3>cX;K9{v^oLe=>JP8J zOMhhbDz5uJdRXYaKRqC@ULqJ5}dSRO-AmDRT;{P&WsB- zw*@9`3S{JN(leAz(0^N6q5mdI{nyZQ4SmBWYQYv{U$&THtt zh7N4#!iG-l#kHgx8#=C`>l!+*q5B#-u%QbZI!NitE2l^?_xx{SRDO<$u6m?eA({HTDIm^)PWykw4H+MzIY?&I$1T@!GFx~dJ2T;DTi>M z)PwgkHrddDUCX3hcEz9$JgiCBBZvD3b>NZGHk%CFtf2#^tQ6_oYfrXee*#;W0$bQe zKozzz(uFOsmHh>1h3)KppaxsoyFfc^Yi|P`u(kaO=!EU_Hic?~QFj zzwx51_iy;zANg#E4xA^>rJ?Lx2YyBTnD~L zgtJgLp#z_+3%!Z5DzF#60aRf>Bwg47dm`z=R@fIw7uH~JBwg4J`y=VX4%j0}7k0uv zX{@oy?o0N1L#I7jEc7U{-5*^$R)4VmW3Ipc{RN@R4SRG@r*+K`X@@_isShr>nx}i> zIgxH8Z2qp*;|BENgmRHK8M;J5=u`iLO;TSDiu5Va&J1n&%Sw^fy&w55m74PUK^>6Z zDGdFW^jsBsF6p}#=)0u%TA@oA`mfL_)^UB<0ezTs3@7yBq4oEm{={4G8MXiF1o(`% zV%Wpc29nn(J?C{L#iyYSB)>VY!Eab#pl_fZ2s5PKF&=fLa3<Mz#J~ZgSqzBuf3zPQYfCfUEhZCB};d%SD1M}u#t5iZLvTrKa zZe5$rIty&_vzE)b{(Ja$LjOJR+o<4Y^bChS=Nc8BMXmwUvzVZCaSixr12o{z3QyL6 zw+Qa1VBVxUv|od|<5@Tt^XGsD4E=yOelF%xO4EA12&Z5!9ngUHAP)0v3g%Nv6Sz@? z&&9kupaDatBCel{xtG#3-!8%_n0t+OGg04z!^-|@uIZ}#=W~tMvcG|AzSjK#p#kr2 z=bEs6|2nP_JNAdUX6)R*&GF?`2Tt;V$U_|sJ$B9uf_wR>=bW_%xW4;$%zsp#r%}&= zbkB)&&!VoR+dhH0k+N!M1J(UuuK8N_uj3l9b$>h8gth$vt`Xb!H^h==?ASj) zmNaDN{_(sY%1FLI4no%d_5`C8^Z!8Ks(yvMmF ztj&9bYsB_>2e@YJn0IGfBb8;|%_nKWc_@>7{uasSkNa8Vqm1(T8<%6PDCalft;2N=g z-j!T4cFbE6M;fwo-W7vd@ct+#&7q&IjgBUb_b}FAG)B?)16YR{-;ZJ~#^=$&XOR1k z&@<=?yM!jZeuL10b-%#orCj^vS}|UnBMnPLcyT>PSG;<7{18oesP<}T!Zc=6`_MRT zXu>o;Qyh)Qh9*p7FvZchYw(W7sW&j*kS6>ZkjC0qfi%{>0;I8)#@<0+n{`qX4ohE~ zZ7EM&g)zy=4&J{ZTDPE`?)PIwV$rw{mC0ghiCC;U8L}c4i%teF8ec4!KMBn*iFLU7 zw?K>^<`;k%Bh1f<^D5>|8hg|}5so@&3L0a(@mwhw)zJwD`8vn+H_Ha;^P$lEwc4zc zrq9<*80AHpw=Kq8q+l*G@{=4z44)&)S4$tYr*9R{)U$YQ+og{hjRVw{^xR7h<8FKo zDPAG3>%(|LX^vtHX_MC-+#=#0zD83I&s!K{^aG81GMP9F1qJSvfW-ryHWVJvd#V-ZARf2WcwUP`%w5f z**4FkZH#syepaHM%EHaDRX}y&Rk7tj%fgP>3ZQl2m9aB`XqVVhpnc&LvBf~NOKb_y zx$sgxF47v0?m=riLR!ZW()x{%)@_8eUL&M+8X>LEhL**PJZW8)2BdY_Vj!){JV08P zEdtWIYyptgW%GfwE~^F7x{UH1x+bRZ(Dg8dhpvSwe7be;T;xgX;9)>o2hRr5I=C80 z>);|Ft%END(mHqwkk-MIfwT^mV{8`Ty^M`Fd>~$`7j5-2+K8SFD#LH*iT3#Sg<@>R z`oc%oVQh05*S#=Hq&YqZ^IH(U6Q#@3HnSY>W|rkHfWN~%)AkJbP8^uGLL2a($YE*5 zeV!XD@_0U7Q=P54-j$V=)&$#+$MFa5L(zcm+=oIBcpAA6MZFqkPYEB2XBF5agNNx^ z;{CLu$Ikm{#Z7iM>TFpN7X7qhn>`s}ZN<9S6rg=Yd+czaV?`i#B+$8{AvW}^$H!3b zr_mOK&o(J)pYGQCp2d6hv&{p(8Cm4lpu;!g$zS-op8UBl{N&Gk;intD z@WYsIjEmoe9|rkf3};8wEbfPKdt<*KH-;DdOF#l;lsdn$Tv+U^4FhV^IPqTF~_wD zkI&Xg2MbmG)XV)1pj%CNtV*YT<9-Ne*O2=s-U-Ny>t>{%9ej9DKSMh1w}6={bPzh3 z`=FQmGe8#$Em#1}Y>}7yIiSp;+FIym^S#`k0XiA?XTbfqKLhSV92Z(?fyqf_h7ZAc z9Mer198*oaF7P1$k#RoWq{AK%Usw-~lfpLBQiPp_zf{HbCagX_fAQd0MQuj>d4-Rv zR{%2#mzn1S(+mCPc|c#`TWT#ZzVI=%0O%}yi1X&*bmgRRG7--aTQQ%HP0EV-b|=E> zit1Pw(6XYO`?*+GDBRCQTk)lxA>O{?BfA20ta#VXV5?ZXS7=C!Z(-;|gl`ZU(&AHm zTJWrmCk+V*7cGu<(vT3g8h#&vdE}#U`-_2vLQ~17@3RyolW>mERL0}GAKFU>;b@_$ z_~^UTUi{Er$nPU?h4A+X)G10ng=;7u;N^-k9_WYWl0gW~#YZSL7e6!?uBpIp!$Ez} z`H|38+TIl!$$~e8#!~yT&`M?>7g|fzvr;oTinUYz>v|mY3E)zxb2P__@D)<;m>(m; z^Q7)kYZKuD(m(KQefnZffgt+2$Cm7E`$R6KnL`Uw7)c&$Q{eziQEQpV0N`)L!tt zz?cdA7T*iZ4fNbrUwe~QbIlD}NdF;xFLwXG<9mUzpEyRo7c;>z;?PXVG4j2b362qm zW=f7t!uJZP?^Q^Sve23D3>F^yA+TLih z(Dp`{g|?Su7TTWhy*NKpaCX9Yf#+oiO!o;qqsAxnXdm%A3tT^m-(JaYx8yh8M>xOX zI5n7#j~(2%2t4P0a=?Kxh4Z^w`d(C>rSHYB7wD@V)qTh32QDEWQ8c-QVt zeXl4Z0<#J2@87FjlorQ+d?*fn9VvL1b=%_AbN{UR!fK+I>J+Qm;lshuJPTg2vP}x@ z-`AMtp#A$myM2poCyO=pcud>Go_$sB%~!Ec-{Ru$f;U*4o`^L?e79OX5zQjLYpRD~ zzrMz>KmSgjY6aTvbgL>b?oKD(so^_ur&W#Bv@Q1Dm|c9XPu9e>$;o1m{&1Ps@Zlbp z-5%6x)Sg6KXTCEZ`$_2fEHxRZB@I_c0;e6VsENkPwQtMMFO}9gj^&hR&9-E8$2& z$b~(`*asi7MW=;=!RTPsk%Vv!_>(Xa@x9Tz5^8FL(YASEEn?$-95s<>>Cch0(}w*e z(bzwq@V=1<&bfdtAX9uB$v-3E97uQo?8XKMG?3EqYTG>HDe9dNN8au6`?+1N<|XpcL|(^XmwV|i!Y=pH`@$~w(!1O)NA|etFvhf(-uBTR7P2K( zzo4)Vp&0iE&L}d>ujP@YWz9VY8s(#USrTwr|Azb~82$w&ja66sOHWkx4u$gTNBx0{r zBKArpVy~17`=nghC*{IEsYK@k-HDF-x)ZTSDiM36T+VyCU5>l3XX*~@nYyie!~Ul- z=Inn`_@j02<6~W<>t2o*qwkl18^yo|U?Z=u$=tx}YcgA$*iYpcvY*O{{Zx)2`>BTR zr5du2>Z*^!fps4T18ebI&i*l;H{?GJ918-AfNKyx2k~UXtGbuk2DlIJ+L#}RH>Hn; zRl@Se!r<0dCoX2#0t76^vX^*N_2^N~tLn#&s%!svm%jGIK7Gv}f1=Og>rk7iileaa z1@8PxyYlw`)YkmRUcL7xKlSwPy~D%pdp`lke)3aqXz$O29d56}*8Co6_T9_VaGM{> z6x#O_r1`m*+vt#HEu}#o+}4M1Xz#t=wUjsFxV?|Z?enfVfw;Tw;dybp-(5(1*Imf# z9xu1^A)N_!8eV4X72;)voo@r$Xjb7&vxrYOtw*@3aHUzCPc>b|&##c@N0`>}^ULJ< zc+*;b-Y?JFO!WL$70!|8O{OdP`B~-wk1sQmeV@2ja=qYQ#r1-VGxmsZPz9C) zEx;0>6*vQ^0gHfkU@_1EqNXxob;|_WfmuKY zFb8M_<^nZf9#94511-SGKum8hP5~;wi-0takHb6rrpFW|jqsPkW;&L>KcpyL!bgS8 zl=ft7+)w)_fcFcVDea%zcsq^nz`GS?9NynIJx+N64=c)8!bd0{;J*obz{Ue&55~p^ z#ooD%cZ&D^jX$RA@xEXACH9H{D?Val3CrJQUc%Bh=ucQ23(k01`1F??XQ>n}ewMij zi;gm~m2Mc9g!w-I_4R22uNo_Gomb%6F#^|k1YVgcFyIDae8o7MCCAw+gWJ5%D`cNz z^x=J8BKy4PYkkgb)aYYwqXv3qpS$@Qv!PLpyNjDSW@G%z!T6U2oCu`7YE{i*uUb{3 z!m|ImUeEqujh_8pr=I;zo1XR87Cq|&UC(;2QO|m3u|AFJ4tptzA?&3?_g)%%)LA}maVXxc4N<47xt`$v1cvl<$KkzH;wOA zTO;?Xb;`YJZE~*~G%UVXO_zJsP-ebYZLzSIP6Ri~q`kCE+DmgNPTEV$<_LaeBd)Ye zaISQw;8zyKVQoSDDxIz<6Tzc0X)i5Xfuxijwv*oQ-P_4Ccang{7k{u3F8Hxmmx6SCvcqP{aDHO zF_P!0lHZNAS50aH4dD7o9EXk6P2(7Bq@3fYAJ@3e6x`-EQy|}440Lji7fYLI(G1RU z+K=f0w`sre7@&tZK11wRt17{MwZFn<`T=aF@4;sJ4s51>h0XK>*i7Gp&GeljeIn=f zHQ*s}{OoL|Sxeis?4~yCSHu3a66{Yag^jdAhxXPCy{+*(->+6sAod#`(Sm<8h_(^ut4!h*IRdO6N`eNcd``S2emvc9_n}XZiZVEj8 zel>8L?^ojpj;D!tfGRKctNjsn)Bk|o^!Kovo`Bu-kFcBm2kfT5hu!qV_jc2Njs0qZ zr?GZ{F`7V|RbaG5pjj0dr6dJU-mixEv)!)-o_?pFZ?S*6EN8NRD#HFb^!HD{&Hjns zKRsLf=f~!M!v48?sQoil+CQ_f=dBWZ-pKyx^2SN~=V7L$?PboiFw05x4(xf$!k#x_ z4-GQ&LBl3$!Fx0R&C@KmcU*QR_Pix?o2WV764;w#hFz%_cF_YketZaf=Z~KQ+h~+I z)30Pl`y%dgvPH2SndZ4yXHNBoEv}R7r9Uz|`EQ*$m0tfSUP2gw$iych^OYjQ0RcGl!75wCOtVP{P)73Y-}5ueZqM)yfoKxH1`g)ZBcJ+U821FYQvV?!L50PrbBQ_gq@S_q!#{(D?71j<`wS z$eV4!aA~fut2D>gU7F$ZW_vQ|H>Ji*n-CM1os!|mPR>Zo_VImfuAC^|F78Z$sTl%2 zX;HzEg~b^o3X3w53NOltpWqF|o$m=a&P@p6 z->M~E#I+_N4mKM)-)IwY&9N~9wi5+wAhK^L6Qa01Lj{t3!veerXa!CIYQV`rJ1`&U z0OkRmz~EQxU?gAoct`T2e|Mhn?;e@$%ovgF$VkdoGtNo32dopEf#?a2fHFZ1blL6R z+t7AawBxP7Xy7eC1sL(FI*o6{;e-W(qh?QZkbJf+uufCFIR)|LCq5ELe&Vp(6Zh5G zK++g0L^#R?L_6Mwy*q@rGS)^2SF(YuZcOc&R(3KamUX&rHK_d;#`7P#-LUkFnu5J{$hC6gRgXW*DSLzawYf?`ekL zcm@1tDNQAOZVkV2*Gv(Y2p`)S7o?x$H(pYNzD!Txe$Z-q4EOuBq({XTBi@?+f7_=3 zwe*ke+z&YYJ$pXFj`Tm;xgT))@9lK2#gbzU;24E#CBNo_UlgvA+^Pn*C|oLeRelPu zb}_*#;tBC;Cy;ox9Z0;|1|(i>1ro2ifyApWAn}Uw_Q-bgO5f}h*=BCE8J!;~+b3E2 zBoC8qGaPMW*awN@xkc127d!HRD(aa9w4`6?$N^f@CpyLhwe<5GnLvB`7>5t&NFU|M z06NnXxIg`3lX&(T%(7iR6lIwIslck&1y;W$u=JR~^51Fdo@ucs_0w*K^2A1IX+Ra& z1GE6C&8$FbD-B3(Wd~AQIe^qwPM{nY7E4~&8qc`ri1cmmAN1Ewc}SGQ{U~?_-h=;D zlK-W~bt7*VX_D_Ayl&X_B5wE|O}*&?+ezj37NPvM8sYnHyIJ^D+b(r@5O1|z>_`S` zwhJBHr`k5r!F{T2=Q+4fwQUS<&-JoBH_GrfKukRS0vR@VDRfgxM!TANqp+($?oMw#fcz zMgN4p_#0*_jES<3n$br?`O=1Vrv4$jD)mn*kouYrvH^-m*^_Yd-3 zfWBbBMbK;d-gfJKU-bDRUmWsH!?3o1^qse%O}+oNuj~Ck`NHqN=?j1M3-3yle{K9b zLX(0X#5JkEVFY^rGoeYn|B29~-v7u~jC}gu|J#7Ccy*t`xF&T-fgnd()C%)ruFb44 zuSUMB=wnhC94mCqkJ+`}TQ5x9kjMAq3LcI()nFcuFDx7I53fepSvcJcqY17vnPwq;S>ZJ3ME`ZHvhYLmWx$#653dAP z6gHV>0^)=~ov{q*?Y5g^6`;fRgIHQytuCQ=1Yi-vGXR_-Y&NU&PoA6cd`%&n4KSbd79jWK z1aklIV%XZ|uT_+3gd2pNu86*`P?U=ZR|-4b6nw88A^gZ|M+-Y0`H|Pgi{Iv{O;VI% z*p}yCMR@@?DM}Gx2jv4?tteA~wWEapchQra12+C;d5>zF9cW*!bqsB9{#>?8t)uNMwT{oBb$kk~<6~$YA42Q+99qYx z&^kVb*74zD{bJ%Lw2oqq@C`o;t)uAb9&Or||Atxz_(i|{R4Vz!wGPRzQt<1nv<~o# ze8Wq@FRpb+ewBh>=ksUzpY#nsgVuppyJJCU9d^Ru(l;Dhhn2$4Z`V46?`*8}4Yx_( zaI^FcKRc}>PV)X6?P=imBDRV9hA%ip=UDBP{B}!zL-$xr;|g?-!RLUFA6(}Euer|g zHGY4SZ#d?g)9M_bL+AJuI>*P*IX;BW@i}ykPoZ;s44vb{|6w|ZB@9eOis<9cUbX zhQ{$0G>#9Tal8YK*vbL3-P3O0?tXejqt$lf*X-TKMefg3Jd6^S4v)nnkxsxWjn{#sO z>nAqk))!n1IPB|f9UUWpYHF6|D%7e`yc&# z<1Djvd1n*LY+T#gv342DT-v%aQ?KvPnf#yPJKI@i`|`&6&U%*F*|esUr>FKEk)!w~@N|D~`*iW& z`qzcO|2y3D=c>Q|JKW#@9quc?hhL47>#|VnaX1{Aa|`wKS(1*BkS!f%$X;d}yAfuSRyp{2Pb3&mxiK#`VTLYhVk` z)5kb(TrXl7{eCvj+clAVp59QE2Grl>8>)99=7yD+BX$ z+ZJ(=QGa8dW3-%E7o^JWQ$#L4)N%ApF3#qlU(}a&(YmU-u4PSg8@oHRX4Q(-yRN+_vt{}6)eV%eI&f9PuMa$sxwfWh zO+)n+f#&*qYd2K$*v~UptZG@)TD`hq#ma`}w(8}9^$qpaRNyxS>%WSG?Ua_WTMZ>+9wKg4!98F!>(okKyx_WgRj+RrsHoH1ICwILJ)h+e))lDret-r`@ zT@_ec)7CIl!ISl84`i+$YL#V=h{)>JKvR=!sCzP3Z&+E|(sb1Oz0Q?+Mg6~Iu36o% zy1IS?su-xFy478A2+cZlnqU9r5fMA}JpY`zu4Zk+@>MNpUTUX;|DJa&?X0cXu(qN3 z?|947zu(Ee*FF1R|G4*T!daV;wfzUSG#&t1FfANQu6eYoNs?eBTl&OPg0 zJLPoW=eGTN&-b}+56LO64O34;+s_eOQP(^XKml{##EC1Gub(<8N1qDynZ+RIS4@}R z_@jB;8HEWoRqTGuC2#^@Toh(@xvtRX--mI$IHVVh5cvuOFouO3r3X(-KNsoz_!~vW z=aK|2mB(SH<==vIp1(-s`-@y>y{4xx7wHG$1y;)Q_MLW|Af+=a)7MSaNI{e95Xn{kp*FhS@C*mo%(gS=iK4cg5Vgnx=+J%jQ;RpWoKdyt-vobyLlTmNjjY zRv<@iMa#O`HH|H+W|qy(sm{*6xTR^$%4R*KdUEBORunFodw%uInKSW!QDJWN+zYCg zFE5)rx%$$Ql|7}J4nIX6zNNKcRZSb#HQ6~0O$~evMRm%~F0Wa=y3&uu)e8U2 zxrNm^IkVR^%_!vco4Fo^+7>hfR`}aW%jQn4o>JeiHc;1)olDm-x3!@zfQ8mo{ID~# zW|jtWCQrft*%Wee&-OphM*p3kqyIOF^5cT5bLTc(xu&7H4p&$jpri0S*|;d|orUWY zXW-|f#JR=QI5U;jk6KInvS!Y`q#FCab+VsS|sP)ppjp?@*|(I zt*y(Rnq6~dU;QB|1tRkSy-cFMaVB1bq_>oRW*rzHhQ^Zukxwt*H|flBddhTc{QGy5 z56;(fpNTXj{M&bwFSz2&a(W6)Er(Alug@oXo_{6i`t^?q_}2`I(TRF=7Uk}j$W2v?c+;uleKRd<>?vU6=I6p6Yks@QOs$Z zkVa*aO{ULaQ&bYI54= zm={eWlNvRL%^P;4XwmGaz0vG?gr43hn0kzsvFde6j`>@Xrf5AjyC)bbf5XnqBO+}x zyg^ji7Uw9n;m$mFM%?JWiJye41@O z?2KjX8K48$3$z0d0&PIk0%y0?{QiIibuj~@Q5O?13UwK2FoxrBerMFS8=X<+?x)v8 z`=42JvA=gsga6=~fImLY6-+RU3mUPsg8LHOOA^RP1u`!ecIl%>!T;zy>a2V zE^l&j$bg%WYPb92`r|^$TavpBTMS(ZaS7eYhPZAE%33GWT59uQGtdMy0TrMEOjZot z_GEi^f-@l$7v<~@JAsWX>qrf&ov8^{YsiKEH4^>H*)y`s8SdW69E;zM&QyXd z9_Rxm0n>mfK#vkkai#>vjvN~v-xJkkiZ_MqVf)UwuxqCwY}grfYt;2rKFVnb5{|PA z%&`ibWFjnoyKi(9$NnUNheio}HZcm_(sa%EsHjlXwNYJ^H%f^P(!IT*;E^Lrw&GYt zI8nh+(tC7@ld({4nm3%A;k`E3>y0SdHPjE7!FeE};pvUuj_H1H=k$8-IT!kTQHP>JA+yQ5Jx|qw zN=AE-?dlzQVVW=T!VF*hg5Q-3HArv4KBIF@VN61B(hA;_X3PKh_4#NA_+!*yC!p8`oB7BZ; z0^tjUKO_7F;cJBdMEENLLm34@MKB?l5v&L{1UrHQ!P!1CMr~Bkj}^;NTwgrW648FW zM-@9`H%28Po{W%!;A)?|!reG&g{x6rVQw_0r!?kV=} z-o}osEUhCmQ|rh{(ULCM9vF4Oj)47woq@#hDUES4*2Wv7G~`cf@9=rFDD;UjCWUpR zrD-qVc=47?)nAP74s^IZT8GQ6b(mdR)MT?)nPBoOEHa>6T#EfS1b#8@#=tMm*%SB; zufwHkhxL}!i21ev%fMVg*8pS5%g7(K>nBXvW%NgGxjjG}B{`2IplM|>8=fzxRJ zuNh0EIP#a!e#ECx9GDZi?STM09*JzS?>MNiua0ECsVKzRS3XvP6wf+lRU@~(>1Xd_ zphtT1aV2;)`W@}_y+Zq5@w1P`zQ#91nT(@W_2q4^_#?Od&d)v<`>uFd3DVp{*L;mE zpY|K8x(?gaXB+VlM#uCHSkUga)94^Z~&%5w8K=94lbZ&I2Xz&-7^vC zxTmpHsS)Ez)t1-XFz6_5j*NVlqlzUu^L>=KxEwLou3pl78(?Mkbv z*9EJcgKcPIW$^IWwLo?7^|9-Lronf{UIjD{emHg$&^mZx>@`5!;MZe=K>MKLY6dz6 ztu9^{ESO}USfq2F*KHfxLG%sY7r^o9ysm4I4$h;!IQP+h$&{Y4LDqe@tovSB_Xb(_ z!0@_vpgoj9r@a%X4kp{T0!@P+`*xst&}ZKPv<~LjcLHsL)9hV9`(UAc8qhIVV$Z`o z^~9A?jwkLMv_5h7Uh5P09J3mjGsJ>-23Yv+fEn}8z)zxrgojHo|8?9GV4YX;^dpl+ z`jeBQf?sW0;g@}y_ibFKGjiKZKl>sQaVkyiRQ`#a=%C)OKYm5@YY*mdy6#WWM|u9g zi2mupoUZrl-?+%CZjycZe2iX{|E(ETHAwv! z^B=`uzSXLpf9uVD%18ZpcCEg^)mmzW=Qo*1YPqs^4TMfS&l>fmAfqd?Q(OZEdm z^WY)-!$9lcQ}zdew!ug3_XF*N_uKCUItK6Hoc;K5!P!rLF7D@X$=N?h&c3l(q`w8; z@H$EkzX}fPb>W=FF_mM7W7h)JW3R`q2bzw(6MGfVeC)&6O+f3h6S3C-ZO6Wj4Fc`Q z6nhxxIQCTR?3=H~{PES1k5?*+e*M#7lcKaw(Lbi|TIr{@`T}bMN;{5mLC-g(Lgf3y zRMAdXV%?y(KjnRc#srKJr_Ljk{#Ld~KXpE#^p{iVKFWDRj3M(W{Z;g9O2-(IEXR;U zy$xtg<=o#OxxYbje}m-y2Fd*mlKUGZ_cs*UyQvK}d}yO{e0(fQ@3+oV`|YQLF?zq9 zPU$p9QU2otNEiLKknknUX_WqlS7P}6dZS<9Th9x8`4NGyJ{~jd{_*i|?@eO-+uJS1 zzrEYV__sGK#=pH+iScjmDlz`;ZM5&9y6>&9uLoyot(mi1tTl52isQE{R6ot}%FW}W zWnU~IqYB$(&9@tD zB8}!*N~3vIzwgBTMvS{Omo@=uE?o|!xzrD&xwIBYb7?)0=F+t|?^0~je7X!s^J)7_ z;vAWnOX(ame-hGsNl0^K`-`$};6A0%9N7MXtQ$B^X*AER!SVI_(KxXP_lV}U#Xy?d z<^ySNs{+#8Rsp2BtrAFcTN#k%wsIiNZSDUm>vqcg*8c2{gHD_;6Y~sV*0vH+Upaoz zoY#Ix};oAyY0{9Loia>7^Tj1MvwU_E%uCO%1QrZVZ&g2DELt`Y}8&891XDv{o z=JC4Tu1@3iyIsxY@xAI49^b1@;&FqT!{Y`uK4D?lY3uEDTA%K8nxE=)nx5=*s!wz} zmB%`r?9onVE2Fri2!a7C;CSx6A0;XUc!+linlo2LIW4Fr!H3_Um8=;`U|_Vn!7>FL?J-P6<2>FEiEJw2PQ@$?iFa=Xu- zXMSxB6}+T{Hoc&Qg1^*49nWc@t%tPGjy^5a)vJYWcv>r9@BX&x289(8b}6igaEF2i zDBxCw6%%$ScyIy+6;?`!>n$TJfaMJ1L_WLy$nvR*l0tZqqR_Z7owl z(RR9`#1l?L8lH#Tv^`FN;R1;9=VH1(WvZc&W42)i$8kT4J^ptW*h}{b`jEjbVj!6H4jx2-g|9QXHw*;5hZ(PT!As>O7->_EB4#-<1y zlPqv_lE6`k0u$l|j&ur)a{w{^^{dmXsD4crB1kAbuQ4$i*09^7r?+=V?E-0f%I$l3K* zd3xH`4srL=U3Y=IcW9xueOhS!ZQ$@N;P8#$@D1Q_H#pp-Wf2F#)f_@_brK=CI)xBi z%_Rg^rxC7KSRP@U0;2&?a`!^;u0nFRLUMN+rAh8qoWk7-$=!-mxLYB)TLJD~2o6<9 z?p1($(a8~{D6(-c#MtWc$6h@9AG0k zzDO?1$lp&7$^*f*Pf6}QD!A8nzvSK>l6yBv?sZGd!Y--Elk#9gduh`&=<`;?Qs1b;Ine|?g_ zX_CL5zm31HA^t7_e;0zkmEdnV_*)A87K6VyI_&cR$J@~s;3l+6#q2g6qsI*lJl{agl$|fyTRjY*-FVkGa!^H~o5LRKGLx^=q z8X@L%FCpf1A7QZq(;BcyVVQ)IzvIExJjvfY$zKnpN&e=Y!rwf}-@H@!n7t*`CB6STPXQEP4f2>mB#L|`5&k>c8|#)RB7xUoqtTFv3peh zn<|an3He7=8oNj4Kd;i*9hcvaXTQIF?488g9Xfw=So^qP{IyB`!pVcL5Af8=$6u@- zPUhp|uT8EG%yNC8%Jl)`y?)2dn%}Y4?MKG~f2XpZyy>2v+-aVkNmD#MSvj7bj7(3D*XQZc(mXwG4_`Zs zn^A5Jxy!VWR;q=(C0Zz>SPNy%(n6Dpv`}uL7RoEo?8L)7oR=_H!TkVEQgDBOSqknK zFhjxp17byQj|Et7&-=UgqZWaQB)rBXhj*p$I&YpBruGp z0rL9;){tKxa3T5q0p~Cy$4glh#~BRID@dP;`v6SKVYtl$!6YB~3l0P|5Bc;A1Rcp( z&jUS9hTA?6G+P-?j5%LnLEQfl;NXb-Th&-#Z2n#~1{jmSTeSc!`MdBu`o9OF^RH8l zKx6(kH3}G&znOD(gk8-4u{MD*R)H3?z-W^|qbe{;i4Pi3-jE;SK>yCc^Qmu0|DHcw{|+wk264>O^>0}2|7-fU^>5R^ zv;T|w_msM&L;APXVCDLEFZg?&<)QzA{%sESW88FN5p@XY06q(}1D^xhfL7AKp|^AW zn``jUzq$TS`nN^u-cAoOq2u}S|<29o{_`}V2&w>94S z&H6X&=43z5vI{)lDsYmCu>9@k(_sq){?aS(UndBBA?-W#?>SQco-Oq691ljbD!(^e zRqVaCsvI__Dz15x{@uT%)PHEna{seS%KXnQDdD>J^Dhg1`YsNJ2?Wjx?}xU&AKLnUXz%->y=PZ@8nde1jp@~{ zM&F!@#>~r8eE$<;LAvD;lvQyKQ@ds{H+nBEZ|o>5(>h8^wT_Y!tz*`#lUldgtaX&% zlsaL-oV(i-US%+hq-TN~?Yw9iXvjEb>(BRAd6wd*LZe>>FPOV$Q@ z$FB3|-g+)Nr-O9vbj=vcS@ zWZ1e7lR-0s9q89m2mh7S!C#U(_>)ox2Xfu}=cIGH|9TWSh4c#OA6y5={xnKQd>Yx~ zfgalb6k}XNK>keHk9ZF42l@u0zG#np0S(U0m~Kl?d=qOU^w)vUg(i~(n>zWw47_g` z(#o8$ttrt3iMt3&s<0T z-EyI|xnPUe^ZrMf$m@n(zHjqGJm0Ie%8*t@>+Sup>%j*8U)H<-qi46D>XZ18(7Nf~ zl`IwKO@ghyF*vYV+ z)BVML#*z7bUc}n~wseZ)exp^m#skapS4Ui%kuL7*$Aq}AUr@Sxn@GofJTC9+8@e7q zZ9y8hA~kMRYTPENahs*aZIv3g4LSg6Ja*^+LhHVq>xPL}3!NefIv}0Xxkh0827$>d z1*Tjn(A6N&9T={ibkltqEOB)K)xkp7PM~RUnrjEpJecF!4zv#XT&+OcpoeSR_Q7Nq zX)caIC)c_YXM=}se*K*9;Bz6>DQOPc0qJN5u9<H^5!Hq|)Pj58Fkly_t_ZaDd`k!53-Fxcfq`wc(o6WZ(5TmHF3D-gA8?_3qD~5q;N%Ih(Hi zL_gXDebzQ z>)^6p7p%e$l1{4}E#W>3>d`{(!(ci(jr%m1kLGY62kTKE_j#}#Jrqkm5cZ=_#gZ=U zIC_8VqMkK;4EcPe(72hQ_vkd(TB!~v&`(~){abX;Patw)48av{fkLC47 zAEG=fu|Cl60l$vfxL+6_PMs$x{T=EDNI!MHp!Cg_<|2j{&pO7PzJzXb1%?+V@450$+b-xX#VT zz8ax#t2IL3Hq{7y+gu~`ZEKCtw{0~--?rD(+R4|#QFBt~_AL=>O5Z}oLG$^Gw_g@b zI`%=V6$zigT9Mj{wx7m&QQywSnvv4Z#hQ`N@hAIhraspWY1~`42`#&Gi@=V}0>jsG zByD^2DcUy8$J8b?59``C&A*gJ^RBLK(>zOQG`H&7HqED$MsuisuZi=dWz)P#S~h9h zq-B$~O-(&P&g*aaa@_86zpJ7t;20ab)t=>0e zpFk--rCApS_|lPq5DbNKF$54?bF;} z0;IXW6i9RbEFjJO#lUZx`{g)MDCgVxq}$PX?%yoi66d2l=%0nu{g z53JD3H_JS&G7skJq4JG#UlaDdxN+OCw&mBeZCWh9o^6G({Cc+eV)^xK%ZcUJvn@H6 zU(YsAY$2}8zOB|?1axex_`9xUri1gemMH+zT4oxM)-riOTFXoU(pn}LXv12j6G&^B z4xql4xnJJr2XUXNjnR&L4GtV?>-`Pl-rgG!*nGFZ*84;mw4J_gA8Ox5S*8hP4sHw# zYs2Yi3nlQitpKP7l-OxNQ{Y2e9?%>(VVeT9241)20&RhJY&k%CAlW_%=md`U`Rp9c3CaepF)UEqV+in`C8<=di-PIPlA+OYW5dV)ApA*=!=FUA{Yi?y*0?{( zAEZCY`_iA}L+MZQvGgbT6qg45U;&;i7W{eZ)k#!$Zo3DOdwZ;C26pWu3 zM;}pC@^eXIHL91_uSQMd@rA0!;|o=BEi+V6{~7;(`ID&QxRx@CY=O{HsQoK(y`#R$ z@YD4*r;6)rO;^-WUuynQpLqSFK7zJFwtM(t&^s7o$Eh)+K5+Z_JwW?&p9!>KC@V(` zxuVObCsN{8GYFqY6W#;u_TAuZ5Ph=!DRssy-B&Qr45R;%$z#8*lSjvds&JugA{T=3K+9tl<65i+-1me#d=C;9KYYYKPzZx^~;J~`>dgI_>NS69k5&T`2g zFQrNTl!HI#f)C|VgD(evd=!@&d^z}&PI0MmmV-ZDic9{KgFol+KKboFB%EK$QK@m_ zbxfZBMX7O0ACmc}`H)ne^dYfR{)(HV#@Q`+SiW8IPxm3YNcxaWl|Cesr4LEAQRti# zjY8*4Hwv9I)hKk%@8?5uR=pFPO=In8I)63R&exgZT_f;!y5#RP$=@lGzYx++@|=$i z-#TV+{+5<-{%YWFD)`HNNU&zmzN+_YUoP`&pDp%lpH%s^k1GAz2jzb4y;8q6SnS7V z9E1-^{c=ywH}IDGgu<7E_*)o&4@oh6NJ`;DQVt)IO8Age!G~lqd`OlxY9XA8`;gR2 z9}@7D`;dUI+=m2w!k)b^4=Hr+-E2^aFS{`feYR@8U1kdr3Nf zvCbRjLvl472l@Exko?6g%jZ0-;!fi)_aUJIa zhvXBib3TF($p`Quc@I7$SckYde}D8L`O$~uJA6n+oc#X5QiIY5+Tr^LChq@Xg8zrd zNd6yNd+GZEpfY1|`u7hk-2cNI-oEqH?;p&v+JcwF#D+~h(Oo9&E3#UHGh<@H&&y+* z;Q#SUCeB0s0KOTW`u7i_%jx?EJ2p4kc6Qp`J9?vx@c+Pf7rN-Ys;O@ufWB|w1~Q}g z4njec(pNxnq*oE91bbP81vBo8L<|uF_xqSrZtND{Jg9`vM=^Xp%vvY2!{=j`_IXsw zfgWbXcMi(j5%_A5Ww%e()r#jL#TaMksI+O?yds~;s(Qm#w>Ef!hw7G>+l23QRExOjh92fNvAb$9F9j;JX&|+?xbPwnShe za3nAu7!Py;nc};Kd@@EkseJj{iDLwgOcod~zg3VhS`E@WVhqn;shDI##Tni(Tp+G3 zcEJZJ{y=1whueEhWhvf+EG=yx^QO6^KNRQ9`3I~K^r2-qWk*>#>_D0Zf9Jzy`>kqK zX#UO-?03DUGPi%s`L9`%&wtsPbpCIxiSsgi3GoY` zw<9mt+nG1T>zbG2i)a-+j0_T7B;ep zMU4i-tVaG`O$ux}hu`<^IQ$3i_QS`$TMxhE?L7Rpx8v~Z-tgg9z1K}In7?KEByH@x zEMLmJOy3yPkL*9%oc-z=)NjJbc`CjCGxsp;HLwHF`$QFK*v4}``EXXi?nBT1WP_-{ z8f_e&qsLixs~tr}+b{=OdBw2C%}e8pFWac=CpTEQ6m)sd1lVUFbiffXwS z7SsqVYzPmt;p8#A3-yR%oeuN>bAcLg8ZZqw3Frk*0s4ShzzkpxFcV1kgU(yDSJwMh z)CbpyYb&B_1C~iSS2uY1XL3fChxP#Kg3R+0UNj$6D})VqX^ zi#A)2Ep7i|K^1&P2CRh%eC$(F} z;V;CC71wxg19uAWd^yAioya0Lj)@4jZBF#5`U9{#NQMk@z(`R0kT&Vs|%@s3tz&$1Qr~^IG1~D!Gr{?Gjfm3PZKyVRbZtiaPD}46&`_e#tAHU12L8_XYB=q z^{jn{-fw>(XEG#bypl7T*c#|ZSvi>R%y3vl6FhK zv|H9oyX7)r_cDL5yS1mh`?id=XAk3ViX8KjBV)zx%x^zk3kxzP*cg-;Uwk zx8LL4x8LF2w<86946VX$`90o!%dTm~yKk^l7Qs$g3_B%EpP};QuwT~y-QRu7ohsft z&b>(Rm&UhTX}8RsEci>~TyDB}?>HA)F10I-dAZ}nd&jw{Vr$>Bi*QCNRb-&|(%CZ4ziy1x6_e!Qt<|A^q?D?i)DGzQg`G!Sw&m{%JOyV*fNr z`)9PN&rS7KPqTj_y^7Lx`{xa(+CLMe{c}3(pYx^t(+&HlJ@xabaRcQp#LI_R}O`zvT8)X-nnyQ)%Ng+h9MH-(IzvJ#N?(?DW0Y zAzLS0VPHqK!)|GV9n}VmfgP3Xs+K#GxLq|G_EgV|DL%5Jl1(+%8Us73wd)3~H^$&w zkK%i;@*Ej9`MpyWKAvELp}fxUC=j<}s;rLEO|j=kG?n^V|Y zl@s<641 z1Igyv1SFelJrFk6B$gB$Z5SPnvD-t|aN9*GKbJPyam)hYur0qc5fqYwmBY-~z z+JVXEo#VH|UO6JB$`r(!V+5X{caM#S4X5|0-2|V#_&z_i7wyAC^~rSzu4&X_mGfX! zB@jFqRSE2UkQ~`e`zfajs-;6_i7n#ywYubi)st9+;cd>xQ?Y|WMvTb-? zBHziim*N{*&x^G3i-bRA#SEl9C(@cGi?rscNW-%}m8qX7(w1ir_xmk^O+a}>vCaaj zzyhENSO_!&gFq{gY#lb>H9$LX6VL&qy8e{v?&jzs#5GirEewb!%*a4ZKN<=7w$imC z9oMvs@*(b_IIxuv$MNfHI{h6kd<=#ClTr24x6!D*75H7yF%{e!;uqZ$JTx-pT%*5D z&x40L{W!ACk%PB+rtagtxIwh#Quw{tYofe8iqac_7z+?!0@$GcSe%a);> z?&Iy0<>FDUD|*Q=A6oJ!R+c!qzp=U`Iko_C(-I}N5NKXvjm-mEmnin>K--e9xi77K z$=BT1*0JOSe-de6^xzsWYG%oesO6-c%z z6Oe3EW?%-;3d{uBfOMbjKp)TnlpN*v8}~O^_tRdC=Oa4rQt(UiH<9iErBzDq&6oRd zzxjKHz(l=#sl28#T$AKfnqJ;1_r+u1&gj|0+ILa`UC&|ei>%Im-Ij&8Y4#!Ri)^0# zl8ttc8D z_O6%xZaMnhsebetLw$BB)-<3BbOTL556}$c{R!wo+y>*m2F%`ZHzL!Pl`TOte0`hJn_i0J$mA>cGL<`?}^>EG@v^1 zCYu{*ni#fufaZzYZ7D$OM8)m`+9qCQO9I*_el2{YCrY0FQMTikvK{+nJ3fy(p)TOd zGTGnDBld}Dtqd}``2);9AEzzDc8B>h=y!?+J3I8dIFqj5(e*VQ6US;EwBnocU-~!4 z8mz0hcDGRNdH+LCMEP9kbQymTI^7A|mmX49XG3x%^6^BmxcUSb{JKON%=K9`K z3e)|!Ntf&CJLKtky4Tb53n@c^P5lGPYcbjlD3~z7-@cI)AfY@R$~q4FhqaaC9>%zq<5pPRpPutm`;AaMU-!u^{<@F$4fE@c z8+XWBM}FPVUr3kbe%LhqA;0drGyJ-5 z(dx*r`?+6wdiLDJ%W@5856bR=uc~{$&~TnZ{JA}6_3N%B+Z_D5No$GApFQN)O`3}% z|578C)7U@SK-xspDpR0W^{RL$QfKrGJsl0aW{+5`NwNkH)HqoBQvNH36;tcf>NFt^cN2 z1!(W@j-_?L#v)-W+E_sM4RPo_%Lt+O==YfBb@a77veg4~-2%B^H{#r{yAU?MnhlCF zgYar$A1t8lE9tv{z%{}?m`B^A>3eO!vBEx>OWSec8$>k;ic$#sU(H537O-7W3J6z= z?+ezfQDL{Ue%+g-U-x?H*WD)lx?6|) zb#Ab?c=f-5igTAkl7XyFC|-uu+QZZ!ZtTd8`(HXws)3n?+n@A z-ZR>pYa?i1@9FK$`&{d{Xd{1uHu6VkBcDMV`2^a?N6C8yO)v{f%}!aCQ-E_Y!xjPSZs;YLdHd$zA9q ze5`;@a@ttIbrEov>mtCRv4V5AypY>=_%|&vUkvFYe}XRZN9ZD-K^OT1y2wY+MLvKo z@*Z@N!T$!jh~#dr?tqDn|89vA_~l?f}L4?*%=hj1NIw zQ%&;+LaSqSG6=Y)U~-+wbV_Vl6ih+mOE$C zq{)-r%X22x)SO>Csb*^S@+l1sIm_$nvnNlPAvZr4G|DUlfNdKn4r~R_ltE1K72`fFDEnf#y9+g7p6RW0>3Z8a>j ztzmr|&t0=JP>1A(rd2J~tu^)af#ww~vv&1rmWkVcN%_3Yy49it?lAo+YLi6MGMPj%)azu zIjsH7K;vsP0){*O%at2CPNJQs{PcNAPomv4rRnAM`AO8lqKi!|*1 zITC5n%j@%>j^#4HzF#jdli6r&(6ROX`h2P5GMQ1|uh(Bkth;GQA1bfU&pPV!vpfvH z27Q}w0s>u^etdo2(NUGxPkHHA!2dt_-+9=iEc*I^9QF7wPQY_8`VRY@cD%{hNoDC8 z=%>%0FI+3)tpa!((Wc&hSW5HXbeX1?&%H~e=XMBuL0`y-=o!~P`}FdU4KIIMb, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<2, 1, 1, 1>, - ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, false, false, A0DataType>; + ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, false, A0DataType>; #endif // clang-format on @@ -180,11 +180,11 @@ int main(int argc, char* argv[]) // experts = 8 // per expert: - constexpr ck::index_t valid_tile_num = 52; - constexpr ck::index_t sorted_tile_num = valid_tile_num + 3; + constexpr ck::index_t valid_tile_num = 13; //13 for 128; 52 for 32; 4096 for ds // > token * topk / MPerBlock + constexpr ck::index_t sorted_tile_num = valid_tile_num;// + 3; ck::index_t sorted_size = sorted_tile_num * MPerBlock; ck::index_t valid_size = valid_tile_num * MPerBlock; -#if 0 +#if 1 // GEMM shape ck::index_t N = 6144; ck::index_t K = 4096; @@ -249,14 +249,22 @@ int main(int argc, char* argv[]) // int eids[] = {0, 1, 2, 3, 4, 5, 6, 7}; //, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2} //int eids[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 3, 3, 3}; int eids[sorted_tile_num]{}; + int e_select = 0; for(int i = 0; i < sorted_tile_num; i++) { if (i < valid_tile_num){ - eids[i] = std::rand() % experts; + eids[i] = e_select; + //std::rand() % experts; } else{ eids[i] = 3; } + if (i > ((e_select + 1) * (sorted_tile_num / experts))){ + e_select++; + if (e_select >= experts){ + e_select = experts - 1; + } + } } // int eids[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -319,9 +327,9 @@ int main(int argc, char* argv[]) { case 0: break; case 1: - a0_t_k_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + a0_t_k_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); a1_t_k_k.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); - b0_e_n_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_e_n_k.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); b1_e_n_k.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); d2_e_n.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); break; @@ -445,7 +453,7 @@ int main(int argc, char* argv[]) float gb_per_sec = num_btype / 1.E6 / ave_time; std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec - << " GB/s" << device_op.GetTypeString() << std::endl; + << " GB/s.\n" << device_op.GetTypeString() << std::endl; } if(do_verification) @@ -540,10 +548,14 @@ int main(int argc, char* argv[]) #endif // e_t_n_device_result.savetxt("out.txt"); // e_t_n_host_result.savetxt("ref.txt"); - return ck::utils::check_err( + auto status = ck::utils::check_err( e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2) ? 0 : 1; + if (status == 0){ + printf("Validation Pass.\n"); + } + return status; } return 0; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp index 143f1f85d2..e5b733d1bf 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_gemm_blockscale.hpp @@ -190,18 +190,8 @@ struct DeviceMoeGemmBlockScale #endif hipModule_t module; hipFunction_t kernel_func; - auto status = hipModuleLoad(&module, (std::string(MOE_STAGE2_ASM_DIR) + hsa).c_str()); - if(status != hipSuccess) - { - printf("Failed to load module (%s): %s.\n", hsa.c_str(), hipGetErrorString(status)); - return; - } - status = hipModuleGetFunction(&kernel_func, module, kernel_name.c_str()); - if(hipSuccess != status) - { - printf("Failed to get function (%s): %s.\n", kernel_name.c_str(), hipGetErrorString(status)); - return; - } + hip_check_error(hipModuleLoad(&module, (std::string(MOE_STAGE2_ASM_DIR) + hsa).c_str())); + hip_check_error(hipModuleGetFunction(&kernel_func, module, kernel_name.c_str())); auto arg_size = sizeof(arg); auto arg_ptr = arg; // // RunKernel(impl_ptr); @@ -221,7 +211,7 @@ struct DeviceMoeGemmBlockScale hip_check_error(hipDeviceSynchronize()); hip_check_error(hipEventRecord(start, stream_config.stream_id_)); - status = hipModuleLaunchKernel(kernel_func, + hip_check_error(hipModuleLaunchKernel(kernel_func, gdx, gdy, 1, @@ -231,12 +221,7 @@ struct DeviceMoeGemmBlockScale 0, stream_config.stream_id_, nullptr, - reinterpret_cast(&config)); - if(hipSuccess != status) - { - printf("Failed to Luach Kernel: %s\n", hipGetErrorString(status)); - return; - } + reinterpret_cast(&config))); hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); hip_check_error(hipEventSynchronize(stop)); @@ -249,7 +234,7 @@ struct DeviceMoeGemmBlockScale ave_time = total_time; } else{ - status = hipModuleLaunchKernel(kernel_func, + hip_check_error(hipModuleLaunchKernel(kernel_func, gdx, gdy, 1, @@ -259,12 +244,7 @@ struct DeviceMoeGemmBlockScale 0, stream_config.stream_id_, nullptr, - reinterpret_cast(&config)); - if(hipSuccess != status) - { - printf("Failed to Luach Kernel: %s\n", hipGetErrorString(status)); - return; - } + reinterpret_cast(&config))); } };