mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Merge pull request #226 from devinamatthews/dev
Finish macroization of assembly ukernels.
This commit is contained in:
@@ -98,7 +98,7 @@
|
||||
#define COMMENT_BEGIN "#"
|
||||
#define COMMENT_END
|
||||
|
||||
#define BEGIN_ASM __asm__ volatile (
|
||||
#define BEGIN_ASM() __asm__ volatile (
|
||||
#define END_ASM(...) __VA_ARGS__ );
|
||||
|
||||
|
||||
@@ -149,8 +149,8 @@
|
||||
|
||||
#endif
|
||||
|
||||
#define begin_asm BEGIN_ASM
|
||||
#define end_asm END_ASM
|
||||
#define begin_asm() BEGIN_ASM()
|
||||
#define end_asm(...) END_ASM(__VA_ARGS__)
|
||||
|
||||
#define label(...) LABEL(__VA_ARGS__)
|
||||
#define imm(...) IMM(__VA_ARGS__)
|
||||
|
||||
@@ -107,18 +107,17 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
|
||||
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
|
||||
vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
|
||||
vpermilps(imm(0x4e), ymm2, ymm3)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
|
||||
lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;
|
||||
|
||||
@@ -142,7 +141,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
|
||||
vxorps(ymm15, ymm15, ymm15)
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -251,7 +250,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -348,8 +347,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
|
||||
// ab61 ab63 ab65 ab67
|
||||
// ab71 ) ab73 ) ab75 ) ab77 )
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
|
||||
vbroadcastss(mem(rbx), ymm4) // load beta and duplicate
|
||||
|
||||
@@ -365,7 +364,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -740,19 +739,20 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
|
||||
label(.SDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -761,7 +761,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
#undef KERNEL4x6_1
|
||||
@@ -879,13 +879,12 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
vzeroall()
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
prefetch(0, mem(rax, 64))
|
||||
|
||||
|
||||
@@ -895,7 +894,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
|
||||
add(imm(12*8), rbx)
|
||||
add(imm(8*8), rax)
|
||||
|
||||
mov(%0, rsi) // i = k_iter; notice %0 not $0
|
||||
mov(var(k_iter), rsi) // i = k_iter; notice var(k_iter) not $0
|
||||
test(rsi, rsi)
|
||||
je(.CONSIDERKLEFT)
|
||||
|
||||
@@ -920,7 +919,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
|
||||
|
||||
label(.CONSIDERKLEFT)
|
||||
|
||||
mov(%1, rsi)
|
||||
mov(var(k_left), rsi)
|
||||
test(rsi, rsi)
|
||||
label(.LOOPKLEFT)
|
||||
je(.POSTACCUM)
|
||||
@@ -935,11 +934,11 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
|
||||
label(.POSTACCUM)
|
||||
|
||||
|
||||
mov(%7, rsi) // load cs_c
|
||||
mov(%8, rdi) // load rs_c
|
||||
vmovddup(mem(%4), xmm2) //load alpha
|
||||
vmovddup(mem(%5), xmm3) //load beta
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
vmovddup(mem(var(alpha)), xmm2) //load alpha
|
||||
vmovddup(mem(var(beta)), xmm3) //load beta
|
||||
mov(var(c), rcx) // load address of c
|
||||
sal(imm(3), rsi) // rs_c *= sizeof(double)
|
||||
sal(imm(3), rdi) // cs_c *= sizeof(double)
|
||||
lea(mem(rcx, rdi, 2), rdx)
|
||||
@@ -1034,17 +1033,20 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
|
||||
vmovhpd(xmm14, mem(rdx, rdi, 1))
|
||||
vmovhpd(xmm15, mem(r8, rdi, 1))
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"r" (k_iter),
|
||||
"r" (k_left),
|
||||
"r" (a),
|
||||
"r" (b),
|
||||
"r" (alpha),
|
||||
"r" (beta),
|
||||
"r" (c),
|
||||
"m" (rs_c),
|
||||
"m" (cs_c)
|
||||
[k_iter] "r" (k_iter), // 0
|
||||
[k_left] "r" (k_left), // 1
|
||||
[a] "r" (a), // 2
|
||||
[b] "r" (b), // 3
|
||||
[alpha] "r" (alpha), // 4
|
||||
[beta] "r" (beta), // 5
|
||||
[c] "r" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -1052,7 +1054,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
//The parameter "i" is the iteration number, i.e. the B values to read
|
||||
#define MADD_TO_YMM(i) \
|
||||
@@ -1094,21 +1096,20 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r15) // load address of b_next.
|
||||
//mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r15) // load address of b_next.
|
||||
//mov(var(a_next), r14) // load address of a_next.
|
||||
sub(imm(4*64), r15)
|
||||
|
||||
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
|
||||
vmovsldup(mem(rbx, 0*32), ymm2)
|
||||
vpermilps(imm(0x4e), ymm2, ymm3)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
|
||||
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
|
||||
|
||||
@@ -1126,7 +1127,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
vxorps(ymm14, ymm14, ymm14)
|
||||
vxorps(ymm15, ymm15, ymm15)
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1266,7 +1267,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
|
||||
label(.CCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1377,7 +1378,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
|
||||
// scale by alpha
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate
|
||||
vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate
|
||||
|
||||
@@ -1424,7 +1425,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
|
||||
vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate
|
||||
|
||||
@@ -1434,7 +1435,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -1835,19 +1836,20 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
label(.CDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next)/*, // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next)/*, // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1856,7 +1858,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
|
||||
"ymm8", "ymm9", "ymm10", "ymm11",
|
||||
"ymm12", "ymm13", "ymm14", "ymm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
#define MADDSUBPD_TO_YMM \
|
||||
@@ -1905,21 +1907,20 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
//mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(var(b_next), r15) // load address of b_next.
|
||||
//mov(var(a_next), r14) // load address of a_next.
|
||||
|
||||
vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
|
||||
vmovddup(mem(rbx, 0+0*32), ymm2)
|
||||
vmovddup(mem(rbx, 0+1*32), ymm3)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
|
||||
lea(mem(, rdi, 2), rdi)
|
||||
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
|
||||
@@ -1939,7 +1940,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
|
||||
vxorpd(ymm15, ymm15, ymm15)
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -2083,7 +2084,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
|
||||
|
||||
label(.ZCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -2176,7 +2177,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
|
||||
|
||||
// scale by alpha
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate
|
||||
vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate
|
||||
|
||||
@@ -2190,13 +2191,13 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
|
||||
Z_ALPHA(9, 1)
|
||||
Z_ALPHA(8, 0)
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate
|
||||
vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
|
||||
lea(mem(, rsi, 2), rsi)
|
||||
lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
|
||||
@@ -2508,19 +2509,20 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
|
||||
label(.ZDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -2529,6 +2531,6 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
|
||||
"ymm8", "ymm9", "ymm10", "ymm11",
|
||||
"ymm12", "ymm13", "ymm14", "ymm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -127,7 +127,7 @@ void bli_dpackm_knl_asm_8xk
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
BEGIN_ASM
|
||||
BEGIN_ASM()
|
||||
|
||||
MOV(RSI, VAR(n))
|
||||
MOV(RAX, VAR(a))
|
||||
@@ -321,8 +321,8 @@ void bli_dpackm_knl_asm_24xk
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
BEGIN_ASM()
|
||||
|
||||
MOV(RSI, VAR(n))
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
@@ -524,6 +524,7 @@ void bli_dpackm_knl_asm_24xk
|
||||
|
||||
LABEL(PACK24_DONE)
|
||||
|
||||
END_ASM(
|
||||
: //output operands
|
||||
: //input operands
|
||||
[n] "m" (n),
|
||||
@@ -543,5 +544,5 @@ void bli_dpackm_knl_asm_24xk
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
@@ -129,8 +129,8 @@ void bli_spackm_knl_asm_16xk
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
BEGIN_ASM()
|
||||
|
||||
MOV(RSI, VAR(n))
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
@@ -295,6 +295,7 @@ void bli_spackm_knl_asm_16xk
|
||||
|
||||
LABEL(PACK16_DONE)
|
||||
|
||||
END_ASM(
|
||||
: //output operands
|
||||
: //input operands
|
||||
[n] "m" (n),
|
||||
@@ -314,7 +315,7 @@ void bli_spackm_knl_asm_16xk
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
void bli_spackm_knl_asm_24xk
|
||||
@@ -338,8 +339,8 @@ void bli_spackm_knl_asm_24xk
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
BEGIN_ASM()
|
||||
|
||||
MOV(RSI, VAR(n))
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
@@ -540,6 +541,7 @@ void bli_spackm_knl_asm_24xk
|
||||
|
||||
LABEL(PACK24_DONE)
|
||||
|
||||
END_ASM(
|
||||
: //output operands
|
||||
: //input operands
|
||||
[n] "m" (n),
|
||||
@@ -559,5 +561,5 @@ void bli_spackm_knl_asm_24xk
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
@@ -213,7 +213,7 @@ void bli_dgemm_knl_asm_24x8
|
||||
int tlooph, tloopl, blooph, bloopl;
|
||||
#endif
|
||||
|
||||
BEGIN_ASM
|
||||
BEGIN_ASM()
|
||||
|
||||
#ifdef MONITORS
|
||||
RDTSC
|
||||
|
||||
@@ -210,8 +210,8 @@ void bli_sgemm_knl_asm_24x16
|
||||
int tlooph, tloopl, blooph, bloopl;
|
||||
#endif
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
BEGIN_ASM()
|
||||
|
||||
#ifdef MONITORS
|
||||
RDTSC
|
||||
MOV(VAR(topl), EAX)
|
||||
@@ -664,6 +664,8 @@ void bli_sgemm_knl_asm_24x16
|
||||
MOV(VAR(botl), EAX)
|
||||
MOV(VAR(both), EDX)
|
||||
#endif
|
||||
|
||||
END_ASM(
|
||||
: // output operands
|
||||
#ifdef MONITORS
|
||||
[topl] "=m" (topl),
|
||||
@@ -694,7 +696,7 @@ void bli_sgemm_knl_asm_24x16
|
||||
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
|
||||
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31", "memory"
|
||||
);
|
||||
)
|
||||
|
||||
#ifdef LOOPMON
|
||||
printf("looptime = \t%d\n", bloopl - tloopl);
|
||||
|
||||
@@ -59,13 +59,12 @@ void bli_sgemm_penryn_asm_8x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r9) // load address of b_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r9) // load address of b_next.
|
||||
|
||||
sub(imm(0-8*16), rax) // increment pointers to allow byte
|
||||
sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
|
||||
@@ -74,8 +73,8 @@ void bli_sgemm_penryn_asm_8x4
|
||||
movaps(mem(rax, -7*16), xmm1) // of a and b.
|
||||
movaps(mem(rbx, -8*16), xmm2)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
|
||||
mov(rdi, r12) // make a copy of cs_c (in bytes)
|
||||
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
|
||||
@@ -102,7 +101,7 @@ void bli_sgemm_penryn_asm_8x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -252,7 +251,7 @@ void bli_sgemm_penryn_asm_8x4
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -307,15 +306,15 @@ void bli_sgemm_penryn_asm_8x4
|
||||
addps(xmm5, xmm15)
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
movss(mem(rax), xmm6) // load alpha to bottom 4 bytes of xmm6
|
||||
movss(mem(rbx), xmm7) // load beta to bottom 4 bytes of xmm7
|
||||
pshufd(imm(0x00), xmm6, xmm6) // populate xmm6 with four alphas
|
||||
pshufd(imm(0x00), xmm7, xmm7) // populate xmm7 with four betas
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
mov(rsi, r8) // make a copy of rs_c
|
||||
|
||||
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
|
||||
@@ -821,18 +820,20 @@ void bli_sgemm_penryn_asm_8x4
|
||||
label(.SDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter),
|
||||
"m" (k_left),
|
||||
"m" (a),
|
||||
"m" (b),
|
||||
"m" (alpha),
|
||||
"m" (beta),
|
||||
"m" (c),
|
||||
"m" (rs_c),
|
||||
"m" (cs_c),
|
||||
"m" (b_next)
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next)/*, // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -840,7 +841,7 @@ void bli_sgemm_penryn_asm_8x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
void bli_dgemm_penryn_asm_4x4
|
||||
@@ -865,14 +866,13 @@ void bli_dgemm_penryn_asm_4x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r9) // load address of b_next.
|
||||
mov(%10, r11) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r9) // load address of b_next.
|
||||
mov(var(a_next), r11) // load address of a_next.
|
||||
|
||||
sub(imm(0-8*16), rax) // increment pointers to allow byte
|
||||
sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
|
||||
@@ -881,8 +881,8 @@ void bli_dgemm_penryn_asm_4x4
|
||||
movaps(mem(rax, -7*16), xmm1) // of a and b.
|
||||
movaps(mem(rbx, -8*16), xmm2)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
|
||||
mov(rdi, r12) // make a copy of cs_c (in bytes)
|
||||
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
|
||||
@@ -909,7 +909,7 @@ void bli_dgemm_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1076,7 +1076,7 @@ void bli_dgemm_penryn_asm_4x4
|
||||
|
||||
label(.DCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1132,13 +1132,13 @@ void bli_dgemm_penryn_asm_4x4
|
||||
addpd(xmm6, xmm14)
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
movddup(mem(rax), xmm6) // load alpha and duplicate
|
||||
movddup(mem(rbx), xmm7) // load beta and duplicate
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
mov(rsi, r8) // make a copy of rs_c
|
||||
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
|
||||
@@ -1467,19 +1467,20 @@ void bli_dgemm_penryn_asm_4x4
|
||||
label(.DDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -1487,7 +1488,7 @@ void bli_dgemm_penryn_asm_4x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -76,12 +76,11 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
mov(%2, rax) // load address of a10.
|
||||
mov(%4, rbx) // load address of b01.
|
||||
//mov(%10, r9) // load address of b_next.
|
||||
mov(var(a10), rax) // load address of a10.
|
||||
mov(var(b01), rbx) // load address of b01.
|
||||
//mov(var(b_next), r9) // load address of b_next.
|
||||
|
||||
sub(imm(0-8*16), rax) // increment pointers to allow byte
|
||||
sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
|
||||
@@ -90,8 +89,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
movaps(mem(rax, -7*16), xmm1) // of a and b.
|
||||
movaps(mem(rbx, -8*16), xmm2)
|
||||
|
||||
//mov(%6, rcx) // load address of c11
|
||||
//mov(%9, rdi) // load cs_c
|
||||
//mov(var(c11), rcx) // load address of c11
|
||||
//mov(var(rs_c), rdi) // load cs_c
|
||||
//lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
|
||||
//lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c;
|
||||
|
||||
@@ -117,7 +116,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CONSIDERKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -270,7 +269,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
|
||||
label(.CONSIDERKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.POSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -327,7 +326,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of b11.
|
||||
mov(var(b11), rbx) // load address of b11.
|
||||
|
||||
// xmm8: xmm9: xmm10: xmm11:
|
||||
// ( ab01 ( ab00 ( ab03 ( ab02
|
||||
@@ -361,7 +360,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
// xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
|
||||
// xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
|
||||
|
||||
mov(%9, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
movddup(mem(rax), xmm15) // load alpha and duplicate
|
||||
|
||||
movaps(mem(rbx, 0*16), xmm8)
|
||||
@@ -400,11 +399,11 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
label(.TRSM)
|
||||
|
||||
|
||||
mov(%3, rax) // load address of a11
|
||||
mov(%6, rcx) // load address of c11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
mov(var(c11), rcx) // load address of c11
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
sal(imm(3), rsi) // rs_c *= sizeof( double )
|
||||
sal(imm(3), rdi) // cs_c *= sizeof( double )
|
||||
|
||||
@@ -519,19 +518,20 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a10), // 2
|
||||
"m" (a11), // 3
|
||||
"m" (b01), // 4
|
||||
"m" (b11), // 5
|
||||
"m" (c11), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (alpha), // 9
|
||||
"m" (b_next) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a10] "m" (a10), // 2
|
||||
[a11] "m" (a11), // 3
|
||||
[b01] "m" (b01), // 4
|
||||
[b11] "m" (b11), // 5
|
||||
[c11] "m" (c11), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[alpha] "m" (alpha), // 9
|
||||
[b_next] "m" (b_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -539,7 +539,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -76,12 +76,11 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
mov(%2, rax) // load address of a12.
|
||||
mov(%4, rbx) // load address of b21.
|
||||
//mov(%10, r9) // load address of b_next.
|
||||
mov(var(a12), rax) // load address of a12.
|
||||
mov(var(b21), rbx) // load address of b21.
|
||||
//mov(var(b_next), r9) // load address of b_next.
|
||||
|
||||
add(imm(8*16), rax) // increment pointers to allow byte
|
||||
add(imm(8*16), rbx) // offsets in the unrolled iterations.
|
||||
@@ -106,7 +105,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CONSIDERKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -253,7 +252,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
|
||||
label(.CONSIDERKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.POSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -310,7 +309,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of b11.
|
||||
mov(var(b11), rbx) // load address of b11.
|
||||
|
||||
// xmm8: xmm9: xmm10: xmm11:
|
||||
// ( ab01 ( ab00 ( ab03 ( ab02
|
||||
@@ -344,7 +343,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
// xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
|
||||
// xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
|
||||
|
||||
mov(%9, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
movddup(mem(rax), xmm15) // load alpha and duplicate
|
||||
|
||||
movaps(mem(rbx, 0*16), xmm8)
|
||||
@@ -383,11 +382,11 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
label(.TRSM)
|
||||
|
||||
|
||||
mov(%3, rax) // load address of a11
|
||||
mov(%6, rcx) // load address of c11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
mov(var(c11), rcx) // load address of c11
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
sal(imm(3), rsi) // rs_c *= sizeof( double )
|
||||
sal(imm(3), rdi) // cs_c *= sizeof( double )
|
||||
|
||||
@@ -504,20 +503,20 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a12), // 2
|
||||
"m" (a11), // 3
|
||||
"m" (b21), // 4
|
||||
"m" (b11), // 5
|
||||
"m" (c11), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (alpha), // 9
|
||||
"m" (b_next) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a12] "m" (a12), // 2
|
||||
[a11] "m" (a11), // 3
|
||||
[b21] "m" (b21), // 4
|
||||
[b11] "m" (b11), // 5
|
||||
[c11] "m" (c11), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[alpha] "m" (alpha), // 9
|
||||
[b_next] "m" (b_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -525,7 +524,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -64,10 +64,9 @@ void bli_dtrsm_l_penryn_asm_4x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
mov(%1, rbx) // load address of b11.
|
||||
mov(var(b11), rbx) // load address of b11.
|
||||
|
||||
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
|
||||
movaps(mem(rbx, 1*16), xmm12) // xmm12 = ( beta02 beta03 )
|
||||
@@ -80,11 +79,11 @@ void bli_dtrsm_l_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rax) // load address of a11
|
||||
mov(%2, rcx) // load address of c11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
mov(var(c11), rcx) // load address of c11
|
||||
|
||||
mov(%3, rsi) // load rs_c
|
||||
mov(%4, rdi) // load cs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
sal(imm(3), rsi) // rs_c *= sizeof( double )
|
||||
sal(imm(3), rdi) // cs_c *= sizeof( double )
|
||||
|
||||
@@ -199,13 +198,14 @@ void bli_dtrsm_l_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (a11), // 0
|
||||
"m" (b11), // 1
|
||||
"m" (c11), // 2
|
||||
"m" (rs_c), // 3
|
||||
"m" (cs_c) // 4
|
||||
[a11] "m" (a11), // 0
|
||||
[b11] "m" (b11), // 1
|
||||
[c11] "m" (c11), // 2
|
||||
[rs_c] "m" (rs_c), // 3
|
||||
[cs_c] "m" (cs_c) // 4
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -213,7 +213,7 @@ void bli_dtrsm_l_penryn_asm_4x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -64,10 +64,9 @@ void bli_dtrsm_u_penryn_asm_4x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
mov(%1, rbx) // load address of b11.
|
||||
mov(var(b11), rbx) // load address of b11.
|
||||
|
||||
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
|
||||
movaps(mem(rbx, 1*16), xmm12) // xmm12 = ( beta02 beta03 )
|
||||
@@ -80,11 +79,11 @@ void bli_dtrsm_u_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rax) // load address of a11
|
||||
mov(%2, rcx) // load address of c11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
mov(var(c11), rcx) // load address of c11
|
||||
|
||||
mov(%3, rsi) // load rs_c
|
||||
mov(%4, rdi) // load cs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
sal(imm(3), rsi) // rs_c *= sizeof( double )
|
||||
sal(imm(3), rdi) // cs_c *= sizeof( double )
|
||||
|
||||
@@ -202,13 +201,14 @@ void bli_dtrsm_u_penryn_asm_4x4
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (a11), // 0
|
||||
"m" (b11), // 1
|
||||
"m" (c11), // 2
|
||||
"m" (rs_c), // 3
|
||||
"m" (cs_c) // 4
|
||||
[a11] "m" (a11), // 0
|
||||
[b11] "m" (b11), // 1
|
||||
[c11] "m" (c11), // 2
|
||||
[rs_c] "m" (rs_c), // 3
|
||||
[cs_c] "m" (cs_c) // 4
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -216,7 +216,7 @@ void bli_dtrsm_u_penryn_asm_4x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -62,14 +62,13 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r15) // load address of b_next.
|
||||
mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r15) // load address of b_next.
|
||||
mov(var(a_next), r14) // load address of a_next.
|
||||
|
||||
prefetch(0, mem(rbx, 128)) // prefetch b
|
||||
prefetch(0, mem(rbx, 64+128)) // prefetch b
|
||||
@@ -78,8 +77,8 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
add(imm(32*4), rax)
|
||||
add(imm(12*4), rbx)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
|
||||
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
|
||||
lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c;
|
||||
@@ -103,7 +102,7 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -313,7 +312,7 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -393,8 +392,8 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rax), xmm0) // load alpha and duplicate
|
||||
vbroadcastss(mem(rbx), xmm2) // load beta and duplicate
|
||||
|
||||
@@ -419,7 +418,7 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
|
||||
|
||||
//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -887,19 +886,20 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
label(.SDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -908,7 +908,7 @@ void bli_sgemm_piledriver_asm_16x3
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
void bli_dgemm_piledriver_asm_8x3
|
||||
@@ -933,14 +933,13 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r15) // load address of b_next.
|
||||
mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r15) // load address of b_next.
|
||||
mov(var(a_next), r14) // load address of a_next.
|
||||
|
||||
prefetch(0, mem(rbx, 128)) // prefetch b
|
||||
prefetch(0, mem(rbx, 64+128)) // prefetch b
|
||||
@@ -949,8 +948,8 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
add(imm(16*8), rax)
|
||||
add(imm(12*8), rbx)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
|
||||
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
|
||||
lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c;
|
||||
@@ -974,7 +973,7 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1183,7 +1182,7 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
|
||||
label(.DCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DPOSTACCUM) // if i == 0, we're done.
|
||||
// else, we prepare to
|
||||
@@ -1253,8 +1252,8 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vmovddup(mem(rax), xmm0) // load alpha and duplicate
|
||||
vmovddup(mem(rbx), xmm2) // load beta and duplicate
|
||||
|
||||
@@ -1278,7 +1277,7 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -1606,19 +1605,20 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
label(.DDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1627,7 +1627,7 @@ void bli_dgemm_piledriver_asm_8x3
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
void bli_cgemm_piledriver_asm_4x2
|
||||
@@ -1652,17 +1652,16 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r15) // load address of b_next.
|
||||
mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r15) // load address of b_next.
|
||||
mov(var(a_next), r14) // load address of a_next.
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
|
||||
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
|
||||
|
||||
@@ -1682,7 +1681,7 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1848,7 +1847,7 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
|
||||
label(.CCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1925,7 +1924,7 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
|
||||
// scale by alpha
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate
|
||||
vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate
|
||||
|
||||
@@ -1952,7 +1951,7 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rbx), xmm6) // load beta_r and duplicate
|
||||
vbroadcastss(mem(rbx, 4), xmm7) // load beta_i and duplicate
|
||||
|
||||
@@ -1962,7 +1961,7 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
|
||||
|
||||
|
||||
@@ -2154,19 +2153,20 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
label(.CDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -2175,7 +2175,7 @@ void bli_cgemm_piledriver_asm_4x2
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
void bli_zgemm_piledriver_asm_2x2
|
||||
@@ -2200,17 +2200,16 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r15) // load address of b_next.
|
||||
mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r15) // load address of b_next.
|
||||
mov(var(a_next), r14) // load address of a_next.
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
|
||||
lea(mem(, rdi, 2), rdi)
|
||||
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
|
||||
@@ -2229,7 +2228,7 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -2399,7 +2398,7 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
|
||||
label(.ZCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -2473,7 +2472,7 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
|
||||
// scale by alpha
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vmovddup(mem(rax), xmm0) // load alpha_r and duplicate
|
||||
vmovddup(mem(rax, 8), xmm1) // load alpha_i and duplicate
|
||||
|
||||
@@ -2500,7 +2499,7 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vmovddup(mem(rbx), xmm6) // load beta_r and duplicate
|
||||
vmovddup(mem(rbx, 8), xmm7) // load beta_i and duplicate
|
||||
|
||||
@@ -2510,7 +2509,7 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
|
||||
lea(mem(, rsi, 2), rsi)
|
||||
//lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
|
||||
@@ -2688,19 +2687,20 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
label(.ZDONE)
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -2709,7 +2709,7 @@ void bli_zgemm_piledriver_asm_2x2
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -62,20 +62,19 @@ void bli_sgemm_sandybridge_asm_8x8
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(var(b_next), r15) // load address of b_next.
|
||||
|
||||
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
|
||||
vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
|
||||
vpermilps(imm(0x4e), ymm2, ymm3)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
|
||||
lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;
|
||||
|
||||
@@ -100,7 +99,7 @@ void bli_sgemm_sandybridge_asm_8x8
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -248,7 +247,7 @@ void bli_sgemm_sandybridge_asm_8x8
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -393,8 +392,8 @@ void bli_sgemm_sandybridge_asm_8x8
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
|
||||
vbroadcastss(mem(rbx), ymm4) // load beta and duplicate
|
||||
|
||||
@@ -412,7 +411,7 @@ void bli_sgemm_sandybridge_asm_8x8
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -1002,19 +1001,20 @@ void bli_sgemm_sandybridge_asm_8x8
|
||||
vzeroupper()
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1023,7 +1023,7 @@ void bli_sgemm_sandybridge_asm_8x8
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
void bli_dgemm_sandybridge_asm_8x4
|
||||
@@ -1048,22 +1048,21 @@ void bli_dgemm_sandybridge_asm_8x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r15) // load address of b_next.
|
||||
//mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r15) // load address of b_next.
|
||||
//mov(var(a_next), r14) // load address of a_next.
|
||||
sub(imm(4*64), r15)
|
||||
|
||||
vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
|
||||
vmovapd(mem(rbx, 0*32), ymm2) // elements of a and b.
|
||||
vpermilpd(imm(0x5), ymm2, ymm3)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
|
||||
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
|
||||
|
||||
@@ -1083,7 +1082,7 @@ void bli_dgemm_sandybridge_asm_8x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1228,7 +1227,7 @@ void bli_dgemm_sandybridge_asm_8x4
|
||||
|
||||
label(.DCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1343,8 +1342,8 @@ void bli_dgemm_sandybridge_asm_8x4
|
||||
// ab70 ) ab71 ) ab72 ) ab73 )
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
||||
vbroadcastsd(mem(rbx), ymm2) // load beta and duplicate
|
||||
|
||||
@@ -1362,7 +1361,7 @@ void bli_dgemm_sandybridge_asm_8x4
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -1677,19 +1676,20 @@ void bli_dgemm_sandybridge_asm_8x4
|
||||
vzeroupper()
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next)/*, // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next)/*, // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1698,7 +1698,7 @@ void bli_dgemm_sandybridge_asm_8x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
void bli_cgemm_sandybridge_asm_8x4
|
||||
@@ -1723,22 +1723,21 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(%9, r15) // load address of b_next.
|
||||
//mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
mov(var(b_next), r15) // load address of b_next.
|
||||
//mov(var(a_next), r14) // load address of a_next.
|
||||
sub(imm(4*64), r15)
|
||||
|
||||
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
|
||||
vmovsldup(mem(rbx, 0*32), ymm2)
|
||||
vpermilps(imm(0x4e), ymm2, ymm3)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
|
||||
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
|
||||
|
||||
@@ -1758,7 +1757,7 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -2004,7 +2003,7 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
|
||||
label(.CCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -2175,7 +2174,7 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
|
||||
// scale by alpha
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate
|
||||
vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate
|
||||
|
||||
@@ -2222,7 +2221,7 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
|
||||
vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate
|
||||
|
||||
@@ -2232,7 +2231,7 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -2638,19 +2637,20 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
vzeroupper()
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (b_next)/*, // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c), // 8
|
||||
[b_next] "m" (b_next)/*, // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -2659,7 +2659,7 @@ void bli_cgemm_sandybridge_asm_8x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -2686,21 +2686,20 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
//mov(%10, r14) // load address of a_next.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(var(b_next), r15) // load address of b_next.
|
||||
//mov(var(a_next), r14) // load address of a_next.
|
||||
|
||||
vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
|
||||
vmovddup(mem(rbx, 0+0*32), ymm2)
|
||||
vmovddup(mem(rbx, 0+1*32), ymm3)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
|
||||
lea(mem(, rdi, 2), rdi)
|
||||
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
|
||||
@@ -2721,7 +2720,7 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -2964,7 +2963,7 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
|
||||
label(.ZCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -3083,7 +3082,7 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
|
||||
// scale by alpha
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate
|
||||
vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate
|
||||
|
||||
@@ -3130,7 +3129,7 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate
|
||||
vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate
|
||||
|
||||
@@ -3140,7 +3139,7 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
|
||||
lea(mem(, rsi, 2), rsi)
|
||||
lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
|
||||
@@ -3488,19 +3487,20 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
vzeroupper()
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -3509,7 +3509,7 @@ void bli_zgemm_sandybridge_asm_4x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -306,7 +306,7 @@ void bli_dgemm_skx_asm_16x12_l2(
|
||||
const int64_t rs_c = rs_c_;
|
||||
const int64_t cs_c = cs_c_;
|
||||
|
||||
BEGIN_ASM
|
||||
BEGIN_ASM()
|
||||
|
||||
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
|
||||
VMOVAPD(YMM( 7), YMM(8))
|
||||
|
||||
@@ -336,7 +336,7 @@ void bli_sgemm_skx_asm_32x12_l2(
|
||||
const int64_t rs_c = rs_c_;
|
||||
const int64_t cs_c = cs_c_;
|
||||
|
||||
BEGIN_ASM
|
||||
BEGIN_ASM()
|
||||
|
||||
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
|
||||
VMOVAPD(YMM( 7), YMM(8))
|
||||
|
||||
@@ -99,14 +99,13 @@ void bli_sgemm_zen_asm_6x16
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
@@ -114,8 +113,8 @@ void bli_sgemm_zen_asm_6x16
|
||||
vmovaps(mem(rbx, -4*32), ymm0)
|
||||
vmovaps(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%7, rdi) // load rs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(rs_c), rdi) // load rs_c
|
||||
lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
|
||||
|
||||
lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
|
||||
@@ -130,7 +129,7 @@ void bli_sgemm_zen_asm_6x16
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -257,7 +256,7 @@ void bli_sgemm_zen_asm_6x16
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -305,8 +304,8 @@ void bli_sgemm_zen_asm_6x16
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
|
||||
vbroadcastss(mem(rbx), ymm3) // load beta and duplicate
|
||||
|
||||
@@ -328,7 +327,7 @@ void bli_sgemm_zen_asm_6x16
|
||||
|
||||
|
||||
|
||||
mov(%8, rsi) // load cs_c
|
||||
mov(var(cs_c), rsi) // load cs_c
|
||||
lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)
|
||||
|
||||
lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*cs_c;
|
||||
@@ -872,19 +871,20 @@ void bli_sgemm_zen_asm_6x16
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -893,7 +893,7 @@ void bli_sgemm_zen_asm_6x16
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -945,14 +945,13 @@ void bli_dgemm_zen_asm_6x8
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
@@ -960,8 +959,8 @@ void bli_dgemm_zen_asm_6x8
|
||||
vmovapd(mem(rbx, -4*32), ymm0)
|
||||
vmovapd(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%7, rdi) // load rs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(rs_c), rdi) // load rs_c
|
||||
lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
|
||||
|
||||
lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
|
||||
@@ -976,7 +975,7 @@ void bli_dgemm_zen_asm_6x8
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1103,7 +1102,7 @@ void bli_dgemm_zen_asm_6x8
|
||||
|
||||
label(.DCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1151,8 +1150,8 @@ void bli_dgemm_zen_asm_6x8
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
||||
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
||||
|
||||
@@ -1174,7 +1173,7 @@ void bli_dgemm_zen_asm_6x8
|
||||
|
||||
|
||||
|
||||
mov(%8, rsi) // load cs_c
|
||||
mov(var(cs_c), rsi) // load cs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
|
||||
@@ -1611,19 +1610,20 @@ void bli_dgemm_zen_asm_6x8
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1632,7 +1632,7 @@ void bli_dgemm_zen_asm_6x8
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -1691,14 +1691,13 @@ void bli_cgemm_zen_asm_3x8
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
@@ -1706,8 +1705,8 @@ void bli_cgemm_zen_asm_3x8
|
||||
vmovaps(mem(rbx, -4*32), ymm0)
|
||||
vmovaps(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%7, rdi) // load rs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(rs_c), rdi) // load rs_c
|
||||
lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(scomplex)
|
||||
|
||||
lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c;
|
||||
@@ -1720,7 +1719,7 @@ void bli_cgemm_zen_asm_3x8
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1847,7 +1846,7 @@ void bli_cgemm_zen_asm_3x8
|
||||
|
||||
label(.CCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1916,7 +1915,7 @@ void bli_cgemm_zen_asm_3x8
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate
|
||||
vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate
|
||||
|
||||
@@ -1957,14 +1956,14 @@ void bli_cgemm_zen_asm_3x8
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
|
||||
vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
|
||||
|
||||
|
||||
|
||||
|
||||
mov(%8, rsi) // load cs_c
|
||||
mov(var(cs_c), rsi) // load cs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(scomplex)
|
||||
lea(mem(, rsi, 4), rdx) // rdx = 4*cs_c;
|
||||
lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c;
|
||||
@@ -2143,19 +2142,20 @@ void bli_cgemm_zen_asm_3x8
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -2164,7 +2164,7 @@ void bli_cgemm_zen_asm_3x8
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -2219,14 +2219,13 @@ void bli_zgemm_zen_asm_3x4
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
@@ -2234,8 +2233,8 @@ void bli_zgemm_zen_asm_3x4
|
||||
vmovapd(mem(rbx, -4*32), ymm0)
|
||||
vmovapd(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%7, rdi) // load rs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(rs_c), rdi) // load rs_c
|
||||
lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dcomplex)
|
||||
lea(mem(, rdi, 2), rdi)
|
||||
|
||||
@@ -2249,7 +2248,7 @@ void bli_zgemm_zen_asm_3x4
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -2376,7 +2375,7 @@ void bli_zgemm_zen_asm_3x4
|
||||
|
||||
label(.ZCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -2444,7 +2443,7 @@ void bli_zgemm_zen_asm_3x4
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
|
||||
vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
|
||||
|
||||
@@ -2485,14 +2484,14 @@ void bli_zgemm_zen_asm_3x4
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
|
||||
vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
|
||||
|
||||
|
||||
|
||||
|
||||
mov(%8, rsi) // load cs_c
|
||||
mov(var(cs_c), rsi) // load cs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dcomplex)
|
||||
lea(mem(, rsi, 2), rsi)
|
||||
lea(mem(, rsi, 2), rdx) // rdx = 2*cs_c;
|
||||
@@ -2671,19 +2670,20 @@ void bli_zgemm_zen_asm_3x4
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -2692,7 +2692,7 @@ void bli_zgemm_zen_asm_3x4
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -98,14 +98,13 @@ void bli_sgemm_zen_asm_16x6
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rax)
|
||||
@@ -113,8 +112,8 @@ void bli_sgemm_zen_asm_16x6
|
||||
vmovaps(mem(rax, -4*32), ymm0)
|
||||
vmovaps(mem(rax, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
|
||||
|
||||
lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c;
|
||||
@@ -129,7 +128,7 @@ void bli_sgemm_zen_asm_16x6
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -256,7 +255,7 @@ void bli_sgemm_zen_asm_16x6
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -304,8 +303,8 @@ void bli_sgemm_zen_asm_16x6
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
|
||||
vbroadcastss(mem(rbx), ymm3) // load beta and duplicate
|
||||
|
||||
@@ -327,7 +326,7 @@ void bli_sgemm_zen_asm_16x6
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
|
||||
|
||||
lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*rs_c;
|
||||
@@ -614,19 +613,20 @@ void bli_sgemm_zen_asm_16x6
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -635,7 +635,7 @@ void bli_sgemm_zen_asm_16x6
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
#define DGEMM_INPUT_GS_BETA_NZ \
|
||||
@@ -684,14 +684,13 @@ void bli_dgemm_zen_asm_8x6
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rax)
|
||||
@@ -699,8 +698,8 @@ void bli_dgemm_zen_asm_8x6
|
||||
vmovapd(mem(rax, -4*32), ymm0)
|
||||
vmovapd(mem(rax, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
|
||||
|
||||
lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c;
|
||||
@@ -715,7 +714,7 @@ void bli_dgemm_zen_asm_8x6
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -842,7 +841,7 @@ void bli_dgemm_zen_asm_8x6
|
||||
|
||||
label(.DCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -890,8 +889,8 @@ void bli_dgemm_zen_asm_8x6
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
|
||||
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
|
||||
|
||||
@@ -913,7 +912,7 @@ void bli_dgemm_zen_asm_8x6
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
|
||||
|
||||
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
|
||||
@@ -1199,19 +1198,20 @@ void bli_dgemm_zen_asm_8x6
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1220,7 +1220,7 @@ void bli_dgemm_zen_asm_8x6
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -1279,14 +1279,13 @@ void bli_cgemm_zen_asm_8x3
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rax)
|
||||
@@ -1294,8 +1293,8 @@ void bli_cgemm_zen_asm_8x3
|
||||
vmovaps(mem(rax, -4*32), ymm0)
|
||||
vmovaps(mem(rax, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
|
||||
|
||||
lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c;
|
||||
@@ -1308,7 +1307,7 @@ void bli_cgemm_zen_asm_8x3
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1435,7 +1434,7 @@ void bli_cgemm_zen_asm_8x3
|
||||
|
||||
label(.CCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1504,7 +1503,7 @@ void bli_cgemm_zen_asm_8x3
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate
|
||||
vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate
|
||||
|
||||
@@ -1545,14 +1544,14 @@ void bli_cgemm_zen_asm_8x3
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
|
||||
vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
|
||||
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
|
||||
lea(mem(, rsi, 4), rdx) // rdx = 4*rs_c;
|
||||
lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
|
||||
@@ -1731,19 +1730,20 @@ void bli_cgemm_zen_asm_8x3
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1752,7 +1752,7 @@ void bli_cgemm_zen_asm_8x3
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -1807,14 +1807,13 @@ void bli_zgemm_zen_asm_4x3
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a), rax) // load address of a.
|
||||
mov(var(b), rbx) // load address of b.
|
||||
//mov(%9, r15) // load address of b_next.
|
||||
|
||||
add(imm(32*4), rax)
|
||||
@@ -1822,8 +1821,8 @@ void bli_zgemm_zen_asm_4x3
|
||||
vmovapd(mem(rax, -4*32), ymm0)
|
||||
vmovapd(mem(rax, -3*32), ymm1)
|
||||
|
||||
mov(%6, rcx) // load address of c
|
||||
mov(%8, rdi) // load cs_c
|
||||
mov(var(c), rcx) // load address of c
|
||||
mov(var(cs_c), rdi) // load cs_c
|
||||
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
|
||||
lea(mem(, rdi, 2), rdi)
|
||||
|
||||
@@ -1837,7 +1836,7 @@ void bli_zgemm_zen_asm_4x3
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -1964,7 +1963,7 @@ void bli_zgemm_zen_asm_4x3
|
||||
|
||||
label(.ZCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -2032,7 +2031,7 @@ void bli_zgemm_zen_asm_4x3
|
||||
|
||||
|
||||
|
||||
mov(%4, rax) // load address of alpha
|
||||
mov(var(alpha), rax) // load address of alpha
|
||||
vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
|
||||
vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
|
||||
|
||||
@@ -2073,14 +2072,14 @@ void bli_zgemm_zen_asm_4x3
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of beta
|
||||
mov(var(beta), rbx) // load address of beta
|
||||
vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
|
||||
vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
|
||||
|
||||
|
||||
|
||||
|
||||
mov(%7, rsi) // load rs_c
|
||||
mov(var(rs_c), rsi) // load rs_c
|
||||
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
|
||||
lea(mem(, rsi, 2), rsi)
|
||||
lea(mem(, rsi, 2), rdx) // rdx = 2*rs_c;
|
||||
@@ -2259,19 +2258,20 @@ void bli_zgemm_zen_asm_4x3
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a] "m" (a), // 2
|
||||
[b] "m" (b), // 3
|
||||
[alpha] "m" (alpha), // 4
|
||||
[beta] "m" (beta), // 5
|
||||
[c] "m" (c), // 6
|
||||
[rs_c] "m" (rs_c), // 7
|
||||
[cs_c] "m" (cs_c)/*, // 8
|
||||
[b_next] "m" (b_next), // 9
|
||||
[a_next] "m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -2280,7 +2280,7 @@ void bli_zgemm_zen_asm_4x3
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -81,36 +81,35 @@ void bli_sgemmtrsm_l_zen_asm_6x16
|
||||
|
||||
float* beta = bli_sm1;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a10), rax) // load address of a.
|
||||
mov(var(b01), rbx) // load address of b.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
// initialize loop by pre-loading
|
||||
vmovaps(mem(rbx, -4*32), ymm0)
|
||||
vmovaps(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%7, rcx) // load address of b11
|
||||
mov(var(b11), rcx) // load address of b11
|
||||
mov(imm(16), rdi) // set rs_b = PACKNR = 16
|
||||
lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float)
|
||||
|
||||
// NOTE: c11, rs_c, and cs_c aren't
|
||||
// needed for a while, but we load
|
||||
// them now to avoid stalling later.
|
||||
mov(%8, r8) // load address of c11
|
||||
mov(%9, r9) // load rs_c
|
||||
mov(var(c11), r8) // load address of c11
|
||||
mov(var(rs_c), r9) // load rs_c
|
||||
lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float)
|
||||
mov(%10, r10) // load cs_c
|
||||
mov(var(cs_c), r10) // load cs_c (named operand [cs_c], formerly positional %10)
|
||||
lea(mem(, r10, 4), r10) // cs_c *= sizeof(float)
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -237,7 +236,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -286,7 +285,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of alpha
|
||||
mov(var(alpha), rbx) // load address of alpha
|
||||
vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate
|
||||
|
||||
|
||||
@@ -365,7 +364,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
|
||||
// ymm14 ymm15 = ( beta50..57 ) ( beta58..5F )
|
||||
|
||||
|
||||
mov(%6, rax) // load address of a11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
|
||||
mov(r11, rcx) // recall address of b11
|
||||
mov(r14, rdx) // recall address of b11+8*cs_b
|
||||
@@ -772,19 +771,20 @@ void bli_sgemmtrsm_l_zen_asm_6x16
|
||||
vzeroupper()
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a10), // 2
|
||||
"m" (b01), // 3
|
||||
"m" (beta), // 4
|
||||
"m" (alpha), // 5
|
||||
"m" (a11), // 6
|
||||
"m" (b11), // 7
|
||||
"m" (c11), // 8
|
||||
"m" (rs_c), // 9
|
||||
"m" (cs_c) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a10] "m" (a10), // 2
|
||||
[b01] "m" (b01), // 3
|
||||
[beta] "m" (beta), // 4
|
||||
[alpha] "m" (alpha), // 5
|
||||
[a11] "m" (a11), // 6
|
||||
[b11] "m" (b11), // 7
|
||||
[c11] "m" (c11), // 8
|
||||
[rs_c] "m" (rs_c), // 9
|
||||
[cs_c] "m" (cs_c) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -793,7 +793,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -835,36 +835,35 @@ void bli_dgemmtrsm_l_zen_asm_6x8
|
||||
|
||||
double* beta = bli_dm1;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a10), rax) // load address of a.
|
||||
mov(var(b01), rbx) // load address of b.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
// initialize loop by pre-loading
|
||||
vmovapd(mem(rbx, -4*32), ymm0)
|
||||
vmovapd(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%7, rcx) // load address of b11
|
||||
mov(var(b11), rcx) // load address of b11
|
||||
mov(imm(8), rdi) // set rs_b = PACKNR = 8
|
||||
lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double)
|
||||
|
||||
// NOTE: c11, rs_c, and cs_c aren't
|
||||
// needed for a while, but we load
|
||||
// them now to avoid stalling later.
|
||||
mov(%8, r8) // load address of c11
|
||||
mov(%9, r9) // load rs_c
|
||||
mov(var(c11), r8) // load address of c11
|
||||
mov(var(rs_c), r9) // load rs_c
|
||||
lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double)
|
||||
mov(%10, r10) // load cs_c
|
||||
mov(var(cs_c), r10) // load cs_c
|
||||
lea(mem(, r10, 8), r10) // cs_c *= sizeof(double)
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -991,7 +990,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
|
||||
|
||||
label(.DCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1041,7 +1040,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of alpha
|
||||
mov(var(alpha), rbx) // load address of alpha
|
||||
vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate
|
||||
|
||||
|
||||
@@ -1120,7 +1119,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
|
||||
// ymm14 ymm15 = ( beta50..53 ) ( beta54..57 )
|
||||
|
||||
|
||||
mov(%6, rax) // load address of a11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
|
||||
mov(r11, rcx) // recall address of b11
|
||||
mov(r14, rdx) // recall address of b11+4*cs_b
|
||||
@@ -1488,19 +1487,20 @@ void bli_dgemmtrsm_l_zen_asm_6x8
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a10), // 2
|
||||
"m" (b01), // 3
|
||||
"m" (beta), // 4
|
||||
"m" (alpha), // 5
|
||||
"m" (a11), // 6
|
||||
"m" (b11), // 7
|
||||
"m" (c11), // 8
|
||||
"m" (rs_c), // 9
|
||||
"m" (cs_c) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a10] "m" (a10), // 2
|
||||
[b01] "m" (b01), // 3
|
||||
[beta] "m" (beta), // 4
|
||||
[alpha] "m" (alpha), // 5
|
||||
[a11] "m" (a11), // 6
|
||||
[b11] "m" (b11), // 7
|
||||
[c11] "m" (c11), // 8
|
||||
[rs_c] "m" (rs_c), // 9
|
||||
[cs_c] "m" (cs_c) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1509,7 +1509,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -81,36 +81,35 @@ void bli_sgemmtrsm_u_zen_asm_6x16
|
||||
|
||||
float* beta = bli_sm1;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a10), rax) // load address of a.
|
||||
mov(var(b01), rbx) // load address of b.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
// initialize loop by pre-loading
|
||||
vmovaps(mem(rbx, -4*32), ymm0)
|
||||
vmovaps(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%7, rcx) // load address of b11
|
||||
mov(var(b11), rcx) // load address of b11
|
||||
mov(imm(16), rdi) // set rs_b = PACKNR = 16
|
||||
lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float)
|
||||
|
||||
// NOTE: c11, rs_c, and cs_c aren't
|
||||
// needed for a while, but we load
|
||||
// them now to avoid stalling later.
|
||||
mov(%8, r8) // load address of c11
|
||||
mov(%9, r9) // load rs_c
|
||||
mov(var(c11), r8) // load address of c11
|
||||
mov(var(rs_c), r9) // load rs_c
|
||||
lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float)
|
||||
mov(%10, r10) // load cs_c
|
||||
mov(var(cs_c), r10) // load cs_c
|
||||
lea(mem(, r10, 4), r10) // cs_c *= sizeof(float)
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -237,7 +236,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
|
||||
|
||||
label(.SCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -286,7 +285,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of alpha
|
||||
mov(var(alpha), rbx) // load address of alpha
|
||||
vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate
|
||||
|
||||
|
||||
@@ -365,7 +364,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
|
||||
// ymm14 ymm15 = ( beta50..57 ) ( beta58..5F )
|
||||
|
||||
|
||||
mov(%6, rax) // load address of a11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
|
||||
mov(r11, rcx) // recall address of b11
|
||||
mov(r14, rdx) // recall address of b11+8*cs_b
|
||||
@@ -776,20 +775,20 @@ void bli_sgemmtrsm_u_zen_asm_6x16
|
||||
|
||||
vzeroupper()
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a10), // 2
|
||||
"m" (b01), // 3
|
||||
"m" (beta), // 4
|
||||
"m" (alpha), // 5
|
||||
"m" (a11), // 6
|
||||
"m" (b11), // 7
|
||||
"m" (c11), // 8
|
||||
"m" (rs_c), // 9
|
||||
"m" (cs_c) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a10] "m" (a10), // 2
|
||||
[b01] "m" (b01), // 3
|
||||
[beta] "m" (beta), // 4
|
||||
[alpha] "m" (alpha), // 5
|
||||
[a11] "m" (a11), // 6
|
||||
[b11] "m" (b11), // 7
|
||||
[c11] "m" (c11), // 8
|
||||
[rs_c] "m" (rs_c), // 9
|
||||
[cs_c] "m" (cs_c) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -798,7 +797,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -840,36 +839,35 @@ void bli_dgemmtrsm_u_zen_asm_6x8
|
||||
|
||||
double* beta = bli_dm1;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
begin_asm()
|
||||
|
||||
vzeroall() // zero all xmm/ymm registers.
|
||||
|
||||
|
||||
mov(%2, rax) // load address of a.
|
||||
mov(%3, rbx) // load address of b.
|
||||
mov(var(a10), rax) // load address of a.
|
||||
mov(var(b01), rbx) // load address of b.
|
||||
|
||||
add(imm(32*4), rbx)
|
||||
// initialize loop by pre-loading
|
||||
vmovapd(mem(rbx, -4*32), ymm0)
|
||||
vmovapd(mem(rbx, -3*32), ymm1)
|
||||
|
||||
mov(%7, rcx) // load address of b11
|
||||
mov(var(b11), rcx) // load address of b11
|
||||
mov(imm(8), rdi) // set rs_b = PACKNR = 8
|
||||
lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double)
|
||||
|
||||
// NOTE: c11, rs_c, and cs_c aren't
|
||||
// needed for a while, but we load
|
||||
// them now to avoid stalling later.
|
||||
mov(%8, r8) // load address of c11
|
||||
mov(%9, r9) // load rs_c
|
||||
mov(var(c11), r8) // load address of c11
|
||||
mov(var(rs_c), r9) // load rs_c
|
||||
lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double)
|
||||
mov(%10, r10) // load cs_c
|
||||
mov(var(cs_c), r10) // load cs_c
|
||||
lea(mem(, r10, 8), r10) // cs_c *= sizeof(double)
|
||||
|
||||
|
||||
|
||||
mov(%0, rsi) // i = k_iter;
|
||||
mov(var(k_iter), rsi) // i = k_iter;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DCONSIDKLEFT) // if i == 0, jump to code that
|
||||
// contains the k_left loop.
|
||||
@@ -996,7 +994,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
|
||||
|
||||
label(.DCONSIDKLEFT)
|
||||
|
||||
mov(%1, rsi) // i = k_left;
|
||||
mov(var(k_left), rsi) // i = k_left;
|
||||
test(rsi, rsi) // check i via logical AND.
|
||||
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
|
||||
// else, we prepare to enter k_left loop.
|
||||
@@ -1046,7 +1044,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
|
||||
|
||||
|
||||
|
||||
mov(%5, rbx) // load address of alpha
|
||||
mov(var(alpha), rbx) // load address of alpha
|
||||
vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate
|
||||
|
||||
|
||||
@@ -1125,7 +1123,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
|
||||
// ymm14 ymm15 = ( beta50..53 ) ( beta54..57 )
|
||||
|
||||
|
||||
mov(%6, rax) // load address of a11
|
||||
mov(var(a11), rax) // load address of a11
|
||||
|
||||
mov(r11, rcx) // recall address of b11
|
||||
mov(r14, rdx) // recall address of b11+4*cs_b
|
||||
@@ -1497,20 +1495,20 @@ void bli_dgemmtrsm_u_zen_asm_6x8
|
||||
vzeroupper()
|
||||
|
||||
|
||||
|
||||
end_asm(
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a10), // 2
|
||||
"m" (b01), // 3
|
||||
"m" (beta), // 4
|
||||
"m" (alpha), // 5
|
||||
"m" (a11), // 6
|
||||
"m" (b11), // 7
|
||||
"m" (c11), // 8
|
||||
"m" (rs_c), // 9
|
||||
"m" (cs_c) // 10
|
||||
[k_iter] "m" (k_iter), // 0
|
||||
[k_left] "m" (k_left), // 1
|
||||
[a10] "m" (a10), // 2
|
||||
[b01] "m" (b01), // 3
|
||||
[beta] "m" (beta), // 4
|
||||
[alpha] "m" (alpha), // 5
|
||||
[a11] "m" (a11), // 6
|
||||
[b11] "m" (b11), // 7
|
||||
[c11] "m" (c11), // 8
|
||||
[rs_c] "m" (rs_c), // 9
|
||||
[cs_c] "m" (cs_c) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
@@ -1519,7 +1517,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user