K-Loop Unrolling for 6x64 SGEMM SUP RV kernels

- Added k-loop unrolling by a factor of 4 to the following SGEMM
  SUP RV kernels:
	- 5x48, 5x32, 5x16
	- 4x64, 4x48, 4x32, 4x16
	- 3x48, 3x32, 3x16
	- 2x64, 2x48, 2x32, 2x16
	- 1x64, 1x48, 1x32, 1x16
	- 6x64n, 5x64n, 3x64n, 2x64n, 1x64n
- Removed unused variables which were resulting in warnings during
  compilation.
- Added a newline at the end of header files to resolve warnings
  shown during compilation.

AMD-Internal: [CPUPL-3002]
Change-Id: Iab6cf329f6d7fbd7544b5c8837e493069e8c9921
This commit is contained in:
Arnav Sharma
2023-02-21 15:10:19 +05:30
committed by Arnav Sharma
parent 3d4611ab8b
commit 5baf38b76d
6 changed files with 2392 additions and 336 deletions

View File

@@ -61,7 +61,6 @@ void bli_sgemmsup_rd_zen_asm_5x64_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -487,7 +486,6 @@ void bli_sgemmsup_rd_zen_asm_4x64_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -888,7 +886,6 @@ void bli_sgemmsup_rd_zen_asm_3x64_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -1987,7 +1984,6 @@ void bli_sgemmsup_rd_zen_asm_5x48_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -2406,7 +2402,6 @@ void bli_sgemmsup_rd_zen_asm_4x48_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -2814,7 +2809,6 @@ void bli_sgemmsup_rd_zen_asm_3x48_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -3913,7 +3907,6 @@ void bli_sgemmsup_rd_zen_asm_5x32_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -4331,7 +4324,6 @@ void bli_sgemmsup_rd_zen_asm_4x32_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -4738,7 +4730,6 @@ void bli_sgemmsup_rd_zen_asm_3x32_avx512
uint64_t k_left1 = k_left32 % 8;
uint64_t m_iter = m0 / 6;
uint64_t m_left = m0 % 6;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;

View File

@@ -198,4 +198,4 @@
mov( var( rs_c ), rdi ) \
lea( mem( , rdi, 4 ), rdi ) \
vmovups( xmm4, mem( rcx ) ) \
add( rdi, rcx )
add( rdi, rcx )

View File

@@ -53,7 +53,8 @@ void bli_sgemmsup_rv_zen_asm_5x48_avx512
cntx_t* restrict cntx
)
{
uint64_t k_iter = k0;
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -80,23 +81,121 @@ void bli_sgemmsup_rv_zen_asm_5x48_avx512
lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) => rs_c *= 4
lea( mem( r8, r8, 2 ), r13 ) // r13 = 3 * rs_a
lea( mem( r8, r8, 4 ), r15 ) // r15 = 5 * rs_a
INIT_REG
mov( var( abuf ), rax ) // load address of a
mov( var( bbuf ), rbx ) // load address of b
mov( var( cbuf ), rcx ) // load address of c
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
mov( var( alpha ), rdx ) // load address of alpha
vbroadcastss( ( rdx ), zmm7 )
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
je( .CONSID_K_LEFT )
// The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
label( .K_LOOP_ITER )
// ITER 0
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA3( 4, 20, 21, 22 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA3( 5, 24, 25, 26 )
add( r9, rbx )
add( r10, rax )
// ITER 1
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA3( 4, 20, 21, 22 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA3( 5, 24, 25, 26 )
add( r9, rbx )
add( r10, rax )
// ITER 2
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA3( 4, 20, 21, 22 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA3( 5, 24, 25, 26 )
add( r9, rbx )
add( r10, rax )
// ITER 3
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA3( 4, 20, 21, 22 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA3( 5, 24, 25, 26 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
label( .CONSID_K_LEFT )
mov( var( k_left ), rsi ) // i = k_left;
test( rsi, rsi ) // check i via logical AND.
je( .SPOSTACCUM ) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label( .K_LEFT_LOOP )
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
@@ -117,7 +216,9 @@ void bli_sgemmsup_rv_zen_asm_5x48_avx512
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
jne( .K_LEFT_LOOP ) // if rsi != 0, repeat k-loop
label( .SPOSTACCUM )
// Scaling A * B with alpha.
ALPHA_SCALE3( 7, 8, 9, 10 )
@@ -125,7 +226,7 @@ void bli_sgemmsup_rv_zen_asm_5x48_avx512
ALPHA_SCALE3( 7, 16, 17, 18 )
ALPHA_SCALE3( 7, 20, 21, 22 )
ALPHA_SCALE3( 7, 24, 25, 26 )
mov( var( beta ), rdx ) // load address of beta
vbroadcastss( ( rdx ), zmm4 )
@@ -233,6 +334,7 @@ void bli_sgemmsup_rv_zen_asm_5x48_avx512
: // output operands (none)
: // input operands
[k_iter] "m" (k_iter),
[k_left] "m" (k_left),
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
@@ -278,7 +380,8 @@ void bli_sgemmsup_rv_zen_asm_5x32_avx512
cntx_t* restrict cntx
)
{
uint64_t k_iter = k0;
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -305,27 +408,28 @@ void bli_sgemmsup_rv_zen_asm_5x32_avx512
lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) => rs_c *= 4
lea( mem( r8, r8, 2 ), r13 ) // r13 = 3 * rs_a
lea( mem( r8, r8, 4 ), r15 ) // r15 = 5 * rs_a
INIT_REG
mov( var( abuf ), rax ) // load address of a
mov( var( bbuf ), rbx ) // load address of b
mov( var( cbuf ), rcx ) // load address of c
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
mov( var( alpha ), rdx ) // load address of alpha
vbroadcastss( ( rdx ), zmm7 )
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
je( .CONSID_K_LEFT )
// The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
label( .K_LOOP_ITER )
// ITER 0
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
@@ -337,11 +441,107 @@ void bli_sgemmsup_rv_zen_asm_5x32_avx512
VFMA2( 4, 20, 21 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA2( 5, 24, 25 )
add( r9, rbx )
add( r10, rax )
// ITER 1
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA2( 4, 20, 21 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA2( 5, 24, 25 )
add( r9, rbx )
add( r10, rax )
// ITER 2
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA2( 4, 20, 21 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA2( 5, 24, 25 )
add( r9, rbx )
add( r10, rax )
// ITER 3
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA2( 4, 20, 21 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA2( 5, 24, 25 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
label( .CONSID_K_LEFT )
mov( var( k_left ), rsi ) // i = k_left;
test( rsi, rsi ) // check i via logical AND.
je( .SPOSTACCUM ) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label( .K_LEFT_LOOP )
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA2( 4, 20, 21 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA2( 5, 24, 25 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
jne( .K_LEFT_LOOP ) // if rsi != 0, repeat k-loop
label( .SPOSTACCUM )
// Scaling A * B with alpha.
ALPHA_SCALE2( 7, 8, 9 )
@@ -349,7 +549,7 @@ void bli_sgemmsup_rv_zen_asm_5x32_avx512
ALPHA_SCALE2( 7, 16, 17 )
ALPHA_SCALE2( 7, 20, 21 )
ALPHA_SCALE2( 7, 24, 25 )
mov( var( beta ), rdx ) // load address of beta
vbroadcastss( ( rdx ), zmm4 )
@@ -448,6 +648,7 @@ void bli_sgemmsup_rv_zen_asm_5x32_avx512
: // output operands (none)
: // input operands
[k_iter] "m" (k_iter),
[k_left] "m" (k_left),
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
@@ -493,7 +694,8 @@ void bli_sgemmsup_rv_zen_asm_5x16_avx512
cntx_t* restrict cntx
)
{
uint64_t k_iter = k0;
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -520,23 +722,24 @@ void bli_sgemmsup_rv_zen_asm_5x16_avx512
lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) => rs_c *= 4
lea( mem( r8, r8, 2 ), r13 ) // r13 = 3 * rs_a
lea( mem( r8, r8, 4 ), r15 ) // r15 = 5 * rs_a
INIT_REG
mov( var( abuf ), rax ) // load address of a
mov( var( bbuf ), rbx ) // load address of b
mov( var( cbuf ), rcx ) // load address of c
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
mov( var( alpha ), rdx ) // load address of alpha
vbroadcastss( ( rdx ), zmm7 )
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
je( .CONSID_K_LEFT )
// The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
label( .K_LOOP_ITER )
// ITER 0
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
@@ -551,11 +754,103 @@ void bli_sgemmsup_rv_zen_asm_5x16_avx512
VFMA1( 4, 20 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA1( 5, 24 )
add( r9, rbx )
add( r10, rax )
// ITER 1
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA1( 4, 20 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA1( 5, 24 )
add( r9, rbx )
add( r10, rax )
// ITER 2
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA1( 4, 20 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA1( 5, 24 )
add( r9, rbx )
add( r10, rax )
// ITER 3
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA1( 4, 20 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA1( 5, 24 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
label( .CONSID_K_LEFT )
mov( var( k_left ), rsi ) // i = k_left;
test( rsi, rsi ) // check i via logical AND.
je( .SPOSTACCUM ) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label( .K_LEFT_LOOP )
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 5 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
vbroadcastss( mem( rax, r13, 1 ), zmm4 )
VFMA1( 4, 20 )
vbroadcastss( mem( rax, r8, 4 ), zmm5 )
VFMA1( 5, 24 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
jne( .K_LEFT_LOOP ) // if rsi != 0, repeat k-loop
label( .SPOSTACCUM )
// Scaling A * B with alpha.
ALPHA_SCALE1( 7, 8 )
@@ -563,7 +858,7 @@ void bli_sgemmsup_rv_zen_asm_5x16_avx512
ALPHA_SCALE1( 7, 16 )
ALPHA_SCALE1( 7, 20 )
ALPHA_SCALE1( 7, 24 )
mov( var( beta ), rdx ) // load address of beta
vbroadcastss( ( rdx ), zmm4 )
@@ -658,6 +953,7 @@ void bli_sgemmsup_rv_zen_asm_5x16_avx512
: // output operands (none)
: // input operands
[k_iter] "m" (k_iter),
[k_left] "m" (k_left),
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
@@ -703,7 +999,8 @@ void bli_sgemmsup_rv_zen_asm_3x48_avx512
cntx_t* restrict cntx
)
{
uint64_t k_iter = k0;
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -730,23 +1027,24 @@ void bli_sgemmsup_rv_zen_asm_3x48_avx512
lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) => rs_c *= 4
lea( mem( r8, r8, 2 ), r13 ) // r13 = 3 * rs_a
lea( mem( r8, r8, 4 ), r15 ) // r15 = 5 * rs_a
INIT_REG
mov( var( abuf ), rax ) // load address of a
mov( var( bbuf ), rbx ) // load address of b
mov( var( cbuf ), rcx ) // load address of c
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
mov( var( alpha ), rdx ) // load address of alpha
vbroadcastss( ( rdx ), zmm7 )
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
je( .CONSID_K_LEFT )
// The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
label( .K_LOOP_ITER )
// ITER 0
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
@@ -759,17 +1057,101 @@ void bli_sgemmsup_rv_zen_asm_3x48_avx512
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
add( r9, rbx )
add( r10, rax )
// ITER 1
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
add( r9, rbx )
add( r10, rax )
// ITER 2
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
add( r9, rbx )
add( r10, rax )
// ITER 3
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
label( .CONSID_K_LEFT )
mov( var( k_left ), rsi ) // i = k_left;
test( rsi, rsi ) // check i via logical AND.
je( .SPOSTACCUM ) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label( .K_LEFT_LOOP )
// Load 3 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
vmovups( 0x80( rbx ), zmm2 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA3( 4, 8, 9, 10 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA3( 5, 12, 13, 14 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA3( 6, 16, 17, 18 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
jne( .K_LEFT_LOOP ) // if rsi != 0, repeat k-loop
label( .SPOSTACCUM )
// Scaling A * B with alpha.
ALPHA_SCALE3( 7, 8, 9, 10 )
ALPHA_SCALE3( 7, 12, 13, 14 )
ALPHA_SCALE3( 7, 16, 17, 18 )
mov( var( beta ), rdx ) // load address of beta
vbroadcastss( ( rdx ), zmm4 )
@@ -871,6 +1253,7 @@ void bli_sgemmsup_rv_zen_asm_3x48_avx512
: // output operands (none)
: // input operands
[k_iter] "m" (k_iter),
[k_left] "m" (k_left),
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
@@ -916,7 +1299,8 @@ void bli_sgemmsup_rv_zen_asm_3x32_avx512
cntx_t* restrict cntx
)
{
uint64_t k_iter = k0;
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -943,23 +1327,24 @@ void bli_sgemmsup_rv_zen_asm_3x32_avx512
lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) => rs_c *= 4
lea( mem( r8, r8, 2 ), r13 ) // r13 = 3 * rs_a
lea( mem( r8, r8, 4 ), r15 ) // r15 = 5 * rs_a
INIT_REG
mov( var( abuf ), rax ) // load address of a
mov( var( bbuf ), rbx ) // load address of b
mov( var( cbuf ), rcx ) // load address of c
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
mov( var( alpha ), rdx ) // load address of alpha
vbroadcastss( ( rdx ), zmm7 )
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
je( .CONSID_K_LEFT )
// The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
label( .K_LOOP_ITER )
// ITER 0
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
@@ -971,17 +1356,97 @@ void bli_sgemmsup_rv_zen_asm_3x32_avx512
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
add( r9, rbx )
add( r10, rax )
// ITER 1
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
add( r9, rbx )
add( r10, rax )
// ITER 2
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
add( r9, rbx )
add( r10, rax )
// ITER 3
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
label( .CONSID_K_LEFT )
mov( var( k_left ), rsi ) // i = k_left;
test( rsi, rsi ) // check i via logical AND.
je( .SPOSTACCUM ) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label( .K_LEFT_LOOP )
// Load 2 rows from B matrix.
vmovups( ( rbx ), zmm0 )
vmovups( 0x40( rbx ), zmm1 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA2( 4, 8, 9 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA2( 5, 12, 13 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA2( 6, 16, 17 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
jne( .K_LEFT_LOOP ) // if rsi != 0, repeat k-loop
label( .SPOSTACCUM )
// Scaling A * B with alpha.
ALPHA_SCALE2( 7, 8, 9 )
ALPHA_SCALE2( 7, 12, 13 )
ALPHA_SCALE2( 7, 16, 17 )
mov( var( beta ), rdx ) // load address of beta
vbroadcastss( ( rdx ), zmm4 )
@@ -1064,7 +1529,7 @@ void bli_sgemmsup_rv_zen_asm_3x32_avx512
mov( var( cs_c ), rdi ) // load cs_c
lea( mem( , rdi, 4 ), rdi ) // rdi = cs_c *= sizeof(dt) => cs_c *= 4
lea( mem( rdi, rdi, 2 ), r12 )
UPDATE_C_1X16_BZ( 16 )
UPDATE_C_1X16_BZ( 17 )
@@ -1077,6 +1542,7 @@ void bli_sgemmsup_rv_zen_asm_3x32_avx512
: // output operands (none)
: // input operands
[k_iter] "m" (k_iter),
[k_left] "m" (k_left),
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
@@ -1122,7 +1588,8 @@ void bli_sgemmsup_rv_zen_asm_3x16_avx512
cntx_t* restrict cntx
)
{
uint64_t k_iter = k0;
uint64_t k_iter = k0 / 4;
uint64_t k_left = k0 % 4;
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
@@ -1149,23 +1616,24 @@ void bli_sgemmsup_rv_zen_asm_3x16_avx512
lea( mem( , rdi, 4 ), rdi ) // rs_c *= sizeof(float) => rs_c *= 4
lea( mem( r8, r8, 2 ), r13 ) // r13 = 3 * rs_a
lea( mem( r8, r8, 4 ), r15 ) // r15 = 5 * rs_a
INIT_REG
mov( var( abuf ), rax ) // load address of a
mov( var( bbuf ), rbx ) // load address of b
mov( var( cbuf ), rcx ) // load address of c
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
mov( var( alpha ), rdx ) // load address of alpha
vbroadcastss( ( rdx ), zmm7 )
mov( var( k_iter ), rsi ) // load k_iter
test( rsi, rsi )
je( .CONSID_K_LEFT )
// The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
label( .K_LOOP_ITER )
// ITER 0
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
@@ -1176,17 +1644,93 @@ void bli_sgemmsup_rv_zen_asm_3x16_avx512
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
add( r9, rbx )
add( r10, rax )
// ITER 1
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
add( r9, rbx )
add( r10, rax )
// ITER 2
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
add( r9, rbx )
add( r10, rax )
// ITER 3
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
label( .CONSID_K_LEFT )
mov( var( k_left ), rsi ) // i = k_left;
test( rsi, rsi ) // check i via logical AND.
je( .SPOSTACCUM ) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label( .K_LEFT_LOOP )
// Load 1 row from B matrix.
vmovups( ( rbx ), zmm0 )
// Broadcast 3 elements from a row of A & do VFMA with rows of B.
vbroadcastss( ( rax ), zmm4 )
VFMA1( 4, 8 )
vbroadcastss( mem( rax, r8, 1 ), zmm5 )
VFMA1( 5, 12 )
vbroadcastss( mem( rax, r8, 2 ), zmm6 )
VFMA1( 6, 16 )
add( r9, rbx )
add( r10, rax )
dec( rsi )
jne( .K_LOOP_ITER ) // if rsi != 0, repeat k-loop
jne( .K_LEFT_LOOP ) // if rsi != 0, repeat k-loop
label( .SPOSTACCUM )
// Scaling A * B with alpha.
ALPHA_SCALE1( 7, 8 )
ALPHA_SCALE1( 7, 12 )
ALPHA_SCALE1( 7, 16 )
mov( var( beta ), rdx ) // load address of beta
vbroadcastss( ( rdx ), zmm4 )
@@ -1271,11 +1815,12 @@ void bli_sgemmsup_rv_zen_asm_3x16_avx512
label( .SDONE )
end_asm(
: // output operands (none)
: // input operands
[k_iter] "m" (k_iter),
[k_left] "m" (k_left),
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),

View File

@@ -354,4 +354,4 @@
vmovss( xmm6, (rcx, rdi, 1) ) \
vmovss( xmm7, (rcx, rdi, 2) ) \
vmovss( xmm12, (rcx, r12, 1) ) \
lea( (rcx, rdi, 4), rcx )
lea( (rcx, rdi, 4), rcx )

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff