mirror of
https://github.com/amd/blis.git
synced 2026-05-13 10:35:38 +00:00
Apply patch from @xrq-phys.
This commit is contained in:
@@ -130,6 +130,13 @@
|
||||
SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \
|
||||
SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR)
|
||||
|
||||
// GEMM_C_FMLA_UKER: expands to five GEMM_FMLA2 invocations, one per index 0..4.
// Invocation n receives (CnFH, CnLH, PT, ZnFH, ZnLH, ZSCALE).
//   C0FH..C4FH, C0LH..C4LH : first register pair handed to each GEMM_FMLA2;
//                            the call sites in this file pass the accumulator
//                            registers here (z0-z9 or z10-z19).
//   PT                     : one predicate register shared by all five
//                            invocations (p0 at the call sites). Note that
//                            GEMM_C_FMAD_UKER takes two (PFH, PLH) instead.
//   Z0FH..Z4FH, Z0LH..Z4LH : second register pair handed to each GEMM_FMLA2;
//                            the call sites pass scratch registers z20-z29.
//   ZSCALE                 : forwarded unchanged to every invocation (z31 at
//                            the call sites, the register tested against 0.0
//                            before the beta-scaling path).
// NOTE(review): GEMM_FMLA2 is not defined in this view; presumably it emits
// predicated SVE fmla so that CnFH += ZnFH*ZSCALE and CnLH += ZnLH*ZSCALE.
// FH/LH presumably mean first-half / last-half vector of a column. Confirm both.
#define GEMM_C_FMLA_UKER(C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,PT,Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZSCALE) \
|
||||
GEMM_FMLA2(C0FH,C0LH,PT,Z0FH,Z0LH,ZSCALE) \
|
||||
GEMM_FMLA2(C1FH,C1LH,PT,Z1FH,Z1LH,ZSCALE) \
|
||||
GEMM_FMLA2(C2FH,C2LH,PT,Z2FH,Z2LH,ZSCALE) \
|
||||
GEMM_FMLA2(C3FH,C3LH,PT,Z3FH,Z3LH,ZSCALE) \
|
||||
GEMM_FMLA2(C4FH,C4LH,PT,Z4FH,Z4LH,ZSCALE)
|
||||
|
||||
#define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \
|
||||
GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \
|
||||
GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \
|
||||
|
||||
@@ -264,20 +264,17 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
" fmov s28, #0.0 \n\t"
|
||||
" fmov w16, s28 \n\t"
|
||||
" cmp w16, w8 \n\t"
|
||||
" fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN.
|
||||
" b.eq BETA_ZERO_C \n\t"
|
||||
" \n\t"
|
||||
// First half of C is already loaded in this case.
|
||||
GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7)
|
||||
GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
" \n\t"
|
||||
" BETA_ZERO_C: \n\t"
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
@@ -286,20 +283,18 @@ GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" incb x8 \n\t"
|
||||
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
" fmov s28, #0.0 \n\t"
|
||||
" fmov w16, s28 \n\t"
|
||||
" cmp w16, w8 \n\t"
|
||||
" \n\t"
|
||||
" fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN.
|
||||
" b.eq BETA_ZERO_G \n\t"
|
||||
" \n\t"
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
" \n\t"
|
||||
" BETA_ZERO_G: \n\t"
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
|
||||
@@ -252,20 +252,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1
|
||||
" \n\t"
|
||||
" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30].
|
||||
" \n\t" // Here used scratch: Z[20-29].
|
||||
" fmov s28, #0.0 \n\t"
|
||||
" fmov w16, s28 \n\t"
|
||||
" cmp w16, w8 \n\t"
|
||||
" fcmp s31, #0.0 \n\t"
|
||||
" b.eq BETA_ZERO_C \n\t"
|
||||
" \n\t"
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7)
|
||||
GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
" \n\t"
|
||||
" BETA_ZERO_C: \n\t"
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7)
|
||||
GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7)
|
||||
" b END_WRITE_MEM \n\t"
|
||||
" \n\t"
|
||||
" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30].
|
||||
@@ -274,20 +270,17 @@ GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7)
|
||||
" incb x8 \n\t"
|
||||
" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip.
|
||||
" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8.
|
||||
" fmov s28, #0.0 \n\t"
|
||||
" fmov w16, s28 \n\t"
|
||||
" cmp w16, w8 \n\t"
|
||||
" b.eq BETA_ZERO_G \n\t"
|
||||
" \n\t"
|
||||
" fcmp s31, #0.0 \n\t"
|
||||
" b.eq BETA_ZERO_G \n\t"
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31)
|
||||
GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31)
|
||||
GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16)
|
||||
GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31)
|
||||
" \n\t"
|
||||
" BETA_ZERO_G: \n\t"
|
||||
" \n\t"
|
||||
GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16)
|
||||
GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16)
|
||||
" \n\t"
|
||||
" END_WRITE_MEM: \n\t"
|
||||
" b END_EXEC \n\t"
|
||||
|
||||
Reference in New Issue
Block a user