Merge pull request #226 from devinamatthews/dev

Finish macroization of assembly ukernels.
Devin Matthews
2018-06-25 14:26:06 -05:00
committed by GitHub
19 changed files with 726 additions and 721 deletions
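
Every hunk below applies the same two-part pattern: the operand lists of the GNU extended-asm blocks gain symbolic names ([k_iter], [a], ...), and the assembly templates reference those names through var() instead of positional %0, %1, .... A minimal standalone sketch of the underlying GCC feature (illustrative only — these are not BLIS kernels, and the function names are made up):

#include <stdint.h>

// Positional operands: the template hard-codes "%0", "%1", and silently
// breaks if the operand list is ever reordered.
static inline void load_positional( uint64_t k_iter, double* a )
{
    __asm__ volatile
    (
        "movq %0, %%rsi \n\t"          // i = k_iter
        "movq %1, %%rax \n\t"          // load address of a
        : // output operands (none)
        : "m" (k_iter), // 0
          "m" (a)       // 1
        : "rax", "rsi", "memory"
    );
}

// Named operands (the form this commit finishes rolling out): the
// template refers to operands by name, so list order no longer matters.
static inline void load_named( uint64_t k_iter, double* a )
{
    __asm__ volatile
    (
        "movq %[k_iter], %%rsi \n\t"   // i = k_iter
        "movq %[a],      %%rax \n\t"   // load address of a
        : // output operands (none)
        : [k_iter] "m" (k_iter),
          [a]      "m" (a)
        : "rax", "rsi", "memory"
    );
}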


@@ -98,7 +98,7 @@
#define COMMENT_BEGIN "#"
#define COMMENT_END
#define BEGIN_ASM __asm__ volatile (
#define BEGIN_ASM() __asm__ volatile (
#define END_ASM(...) __VA_ARGS__ );
@@ -149,8 +149,8 @@
#endif
#define begin_asm BEGIN_ASM
#define end_asm END_ASM
#define begin_asm() BEGIN_ASM()
#define end_asm(...) END_ASM(__VA_ARGS__)
#define label(...) LABEL(__VA_ARGS__)
#define imm(...) IMM(__VA_ARGS__)
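
With these wrappers in place, every converted kernel below follows the same skeleton: begin_asm() opens the block, the body uses mov(var(name), reg) in place of mov(%N, reg), and end_asm() receives the named operand and clobber lists as its arguments. A condensed, do-nothing sketch of that skeleton (assumes the macro header from the hunk above is included; sketch_ukernel and its register choices are placeholders, not any real ukernel):

#include <stdint.h>

void sketch_ukernel( uint64_t k_iter, double* a, double* b )
{
    begin_asm()                        // expands to: __asm__ volatile (

    mov(var(k_iter), rsi)              // i = k_iter; named, not %0
    mov(var(a), rax)                   // load address of a
    mov(var(b), rbx)                   // load address of b

    end_asm(                           // expands to: __VA_ARGS__ );
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter), // 0
      [a]      "m" (a),      // 1
      [b]      "m" (b)       // 2
    : // register clobber list
      "rax", "rbx", "rsi", "memory"
    )
}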


@@ -107,18 +107,17 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
vpermilps(imm(0x4e), ymm2, ymm3)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;
@@ -142,7 +141,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
vxorps(ymm15, ymm15, ymm15)
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -251,7 +250,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -348,8 +347,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
// ab61 ab63 ab65 ab67
// ab71 ) ab73 ) ab75 ) ab77 )
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
vbroadcastss(mem(rbx), ymm4) // load beta and duplicate
@@ -365,7 +364,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -740,19 +739,20 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
label(.SDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -761,7 +761,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
#undef KERNEL4x6_1
@@ -879,13 +879,12 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__
(
begin_asm()
vzeroall()
mov(%3, rbx) // load address of b.
mov(%2, rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(a), rax) // load address of a.
prefetch(0, mem(rax, 64))
@@ -895,7 +894,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
add(imm(12*8), rbx)
add(imm(8*8), rax)
mov(%0, rsi) // i = k_iter; notice %0 not $0
mov(var(k_iter), rsi) // i = k_iter; notice var(k_iter) not $0
test(rsi, rsi)
je(.CONSIDERKLEFT)
@@ -920,7 +919,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
label(.CONSIDERKLEFT)
mov(%1, rsi)
mov(var(k_left), rsi)
test(rsi, rsi)
label(.LOOPKLEFT)
je(.POSTACCUM)
@@ -935,11 +934,11 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
label(.POSTACCUM)
mov(%7, rsi) // load cs_c
mov(%8, rdi) // load rs_c
vmovddup(mem(%4), xmm2) //load alpha
vmovddup(mem(%5), xmm3) //load beta
mov(%6, rcx) // load address of c
mov(var(rs_c), rsi) // load cs_c
mov(var(cs_c), rdi) // load rs_c
vmovddup(mem(var(alpha)), xmm2) //load alpha
vmovddup(mem(var(beta)), xmm3) //load beta
mov(var(c), rcx) // load address of c
sal(imm(3), rsi) // cs_c *= sizeof(double)
sal(imm(3), rdi) // rs_c *= sizeof(double)
lea(mem(rcx, rdi, 2), rdx)
@@ -1034,17 +1033,20 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
vmovhpd(xmm14, mem(rdx, rdi, 1))
vmovhpd(xmm15, mem(r8, rdi, 1))
end_asm(
: // output operands (none)
: // input operands
"r" (k_iter),
"r" (k_left),
"r" (a),
"r" (b),
"r" (alpha),
"r" (beta),
"r" (c),
"m" (rs_c),
"m" (cs_c)
[k_iter] "r" (k_iter), // 0
[k_left] "r" (k_left), // 1
[a] "r" (a), // 2
[b] "r" (b), // 3
[alpha] "r" (alpha), // 4
[beta] "r" (beta), // 5
[c] "r" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -1052,7 +1054,7 @@ void bli_dgemm_bulldozer_asm_4x6_fma4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
//The parameter "i" is the iteration number, i.e. the B values to read
#define MADD_TO_YMM(i) \
@@ -1094,21 +1096,20 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r15) // load address of b_next.
//mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r15) // load address of b_next.
//mov(var(a_next), r14) // load address of a_next.
sub(imm(4*64), r15)
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
vmovsldup(mem(rbx, 0*32), ymm2)
vpermilps(imm(0x4e), ymm2, ymm3)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
@@ -1126,7 +1127,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
vxorps(ymm14, ymm14, ymm14)
vxorps(ymm15, ymm15, ymm15)
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1266,7 +1267,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
label(.CCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1377,7 +1378,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
// scale by alpha
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate
vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate
@@ -1424,7 +1425,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate
@@ -1434,7 +1435,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -1835,19 +1836,20 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
label(.CDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next)/*, // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next)/*, // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1856,7 +1858,7 @@ void bli_cgemm_bulldozer_asm_8x4_fma4
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12", "ymm13", "ymm14", "ymm15",
"memory"
);
)
}
#define MADDSUBPD_TO_YMM \
@@ -1905,21 +1907,20 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
//mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(var(b_next), r15) // load address of b_next.
//mov(var(a_next), r14) // load address of a_next.
vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
vmovddup(mem(rbx, 0+0*32), ymm2)
vmovddup(mem(rbx, 0+1*32), ymm3)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
lea(mem(, rdi, 2), rdi)
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
@@ -1939,7 +1940,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
vxorpd(ymm15, ymm15, ymm15)
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -2083,7 +2084,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
label(.ZCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -2176,7 +2177,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
// scale by alpha
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate
vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate
@@ -2190,13 +2191,13 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
Z_ALPHA(9, 1)
Z_ALPHA(8, 0)
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate
vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
lea(mem(, rsi, 2), rsi)
lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
@@ -2508,19 +2509,20 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
label(.ZDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -2529,6 +2531,6 @@ void bli_zgemm_bulldozer_asm_4x4_fma4
"ymm8", "ymm9", "ymm10", "ymm11",
"ymm12", "ymm13", "ymm14", "ymm15",
"memory"
);
)
}


@@ -127,7 +127,7 @@ void bli_dpackm_knl_asm_8xk
const int64_t lda = lda_;
const int64_t ldp = ldp_;
BEGIN_ASM
BEGIN_ASM()
MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
@@ -321,8 +321,8 @@ void bli_dpackm_knl_asm_24xk
const int64_t lda = lda_;
const int64_t ldp = ldp_;
__asm__ volatile
(
BEGIN_ASM()
MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
MOV(RBX, VAR(inca))
@@ -524,6 +524,7 @@ void bli_dpackm_knl_asm_24xk
LABEL(PACK24_DONE)
END_ASM(
: //output operands
: //input operands
[n] "m" (n),
@@ -543,5 +544,5 @@ void bli_dpackm_knl_asm_24xk
"zmm30", "zmm31",
"rax", "rbx", "rcx", "rdi", "rsi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"
);
)
}


@@ -129,8 +129,8 @@ void bli_spackm_knl_asm_16xk
const int64_t lda = lda_;
const int64_t ldp = ldp_;
__asm__ volatile
(
BEGIN_ASM()
MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
MOV(RBX, VAR(inca))
@@ -295,6 +295,7 @@ void bli_spackm_knl_asm_16xk
LABEL(PACK16_DONE)
END_ASM(
: //output operands
: //input operands
[n] "m" (n),
@@ -314,7 +315,7 @@ void bli_spackm_knl_asm_16xk
"zmm30", "zmm31",
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
);
)
}
void bli_spackm_knl_asm_24xk
@@ -338,8 +339,8 @@ void bli_spackm_knl_asm_24xk
const int64_t lda = lda_;
const int64_t ldp = ldp_;
__asm__ volatile
(
BEGIN_ASM()
MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
MOV(RBX, VAR(inca))
@@ -540,6 +541,7 @@ void bli_spackm_knl_asm_24xk
LABEL(PACK24_DONE)
END_ASM(
: //output operands
: //input operands
[n] "m" (n),
@@ -559,5 +561,5 @@ void bli_spackm_knl_asm_24xk
"zmm30", "zmm31",
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
);
)
}


@@ -213,7 +213,7 @@ void bli_dgemm_knl_asm_24x8
int tlooph, tloopl, blooph, bloopl;
#endif
BEGIN_ASM
BEGIN_ASM()
#ifdef MONITORS
RDTSC


@@ -210,8 +210,8 @@ void bli_sgemm_knl_asm_24x16
int tlooph, tloopl, blooph, bloopl;
#endif
__asm__ volatile
(
BEGIN_ASM()
#ifdef MONITORS
RDTSC
MOV(VAR(topl), EAX)
@@ -664,6 +664,8 @@ void bli_sgemm_knl_asm_24x16
MOV(VAR(botl), EAX)
MOV(VAR(both), EDX)
#endif
END_ASM(
: // output operands
#ifdef MONITORS
[topl] "=m" (topl),
@@ -694,7 +696,7 @@ void bli_sgemm_knl_asm_24x16
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);
)
#ifdef LOOPMON
printf("looptime = \t%d\n", bloopl - tloopl);

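The KNL gemm kernels are the only ones shown here that also declare output operands: when MONITORS is defined, rdtsc-based timing counters such as topl, botl, and both are written back through named "=m" outputs. A standalone illustration of that output-operand form (illustrative only; read_tsc is not a BLIS function):

#include <stdint.h>

// Named "=m" output operands: the asm block stores its results directly
// into the C variables, the same mechanism the MONITORS hooks use.
static inline uint64_t read_tsc( void )
{
    uint32_t lo, hi;
    __asm__ volatile
    (
        "rdtsc \n\t"                   // EDX:EAX = time-stamp counter
        "movl %%eax, %[lo] \n\t"
        "movl %%edx, %[hi] \n\t"
        : [lo] "=m" (lo),              // output operands, by name
          [hi] "=m" (hi)
        : // input operands (none)
        : "rax", "rdx"
    );
    return ( ( uint64_t )hi << 32 ) | ( uint64_t )lo;
}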

@@ -59,13 +59,12 @@ void bli_sgemm_penryn_asm_8x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r9) // load address of b_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r9) // load address of b_next.
sub(imm(0-8*16), rax) // increment pointers to allow byte
sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
@@ -74,8 +73,8 @@ void bli_sgemm_penryn_asm_8x4
movaps(mem(rax, -7*16), xmm1) // of a and b.
movaps(mem(rbx, -8*16), xmm2)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
mov(rdi, r12) // make a copy of cs_c (in bytes)
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
@@ -102,7 +101,7 @@ void bli_sgemm_penryn_asm_8x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -252,7 +251,7 @@ void bli_sgemm_penryn_asm_8x4
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -307,15 +306,15 @@ void bli_sgemm_penryn_asm_8x4
addps(xmm5, xmm15)
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
movss(mem(rax), xmm6) // load alpha to bottom 4 bytes of xmm6
movss(mem(rbx), xmm7) // load beta to bottom 4 bytes of xmm7
pshufd(imm(0x00), xmm6, xmm6) // populate xmm6 with four alphas
pshufd(imm(0x00), xmm7, xmm7) // populate xmm7 with four betas
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
mov(rsi, r8) // make a copy of rs_c
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
@@ -821,18 +820,20 @@ void bli_sgemm_penryn_asm_8x4
label(.SDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter),
"m" (k_left),
"m" (a),
"m" (b),
"m" (alpha),
"m" (beta),
"m" (c),
"m" (rs_c),
"m" (cs_c),
"m" (b_next)
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next)/*, // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -840,7 +841,7 @@ void bli_sgemm_penryn_asm_8x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
void bli_dgemm_penryn_asm_4x4
@@ -865,14 +866,13 @@ void bli_dgemm_penryn_asm_4x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r9) // load address of b_next.
mov(%10, r11) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r9) // load address of b_next.
mov(var(a_next), r11) // load address of a_next.
sub(imm(0-8*16), rax) // increment pointers to allow byte
sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
@@ -881,8 +881,8 @@ void bli_dgemm_penryn_asm_4x4
movaps(mem(rax, -7*16), xmm1) // of a and b.
movaps(mem(rbx, -8*16), xmm2)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
mov(rdi, r12) // make a copy of cs_c (in bytes)
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
@@ -909,7 +909,7 @@ void bli_dgemm_penryn_asm_4x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1076,7 +1076,7 @@ void bli_dgemm_penryn_asm_4x4
label(.DCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1132,13 +1132,13 @@ void bli_dgemm_penryn_asm_4x4
addpd(xmm6, xmm14)
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
movddup(mem(rax), xmm6) // load alpha and duplicate
movddup(mem(rbx), xmm7) // load beta and duplicate
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
mov(rsi, r8) // make a copy of rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
@@ -1467,19 +1467,20 @@ void bli_dgemm_penryn_asm_4x4
label(.DDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next), // 9
"m" (a_next) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -1487,7 +1488,7 @@ void bli_dgemm_penryn_asm_4x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}


@@ -76,12 +76,11 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a10.
mov(%4, rbx) // load address of b01.
//mov(%10, r9) // load address of b_next.
mov(var(a10), rax) // load address of a10.
mov(var(b01), rbx) // load address of b01.
//mov(var(b_next), r9) // load address of b_next.
sub(imm(0-8*16), rax) // increment pointers to allow byte
sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
@@ -90,8 +89,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
movaps(mem(rax, -7*16), xmm1) // of a and b.
movaps(mem(rbx, -8*16), xmm2)
//mov(%6, rcx) // load address of c11
//mov(%9, rdi) // load cs_c
//mov(var(c11), rcx) // load address of c11
//mov(var(rs_c), rdi) // load cs_c
//lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
//lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c;
@@ -117,7 +116,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CONSIDERKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -270,7 +269,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
label(.CONSIDERKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.POSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -327,7 +326,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
mov(%5, rbx) // load address of b11.
mov(var(b11), rbx) // load address of b11.
// xmm8: xmm9: xmm10: xmm11:
// ( ab01 ( ab00 ( ab03 ( ab02
@@ -361,7 +360,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
// xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
// xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
mov(%9, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
movddup(mem(rax), xmm15) // load alpha and duplicate
movaps(mem(rbx, 0*16), xmm8)
@@ -400,11 +399,11 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
label(.TRSM)
mov(%3, rax) // load address of a11
mov(%6, rcx) // load address of c11
mov(var(a11), rax) // load address of a11
mov(var(c11), rcx) // load address of c11
mov(%7, rsi) // load rs_c
mov(%8, rdi) // load cs_c
mov(var(rs_c), rsi) // load rs_c
mov(var(cs_c), rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
@@ -519,19 +518,20 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a10), // 2
"m" (a11), // 3
"m" (b01), // 4
"m" (b11), // 5
"m" (c11), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (alpha), // 9
"m" (b_next) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a10] "m" (a10), // 2
[a11] "m" (a11), // 3
[b01] "m" (b01), // 4
[b11] "m" (b11), // 5
[c11] "m" (c11), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[alpha] "m" (alpha), // 9
[b_next] "m" (b_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -539,7 +539,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}


@@ -76,12 +76,11 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a12.
mov(%4, rbx) // load address of b21.
//mov(%10, r9) // load address of b_next.
mov(var(a12), rax) // load address of a12.
mov(var(b21), rbx) // load address of b21.
//mov(var(b_next), r9) // load address of b_next.
add(imm(8*16), rax) // increment pointers to allow byte
add(imm(8*16), rbx) // offsets in the unrolled iterations.
@@ -106,7 +105,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CONSIDERKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -253,7 +252,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
label(.CONSIDERKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.POSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -310,7 +309,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
mov(%5, rbx) // load address of b11.
mov(var(b11), rbx) // load address of b11.
// xmm8: xmm9: xmm10: xmm11:
// ( ab01 ( ab00 ( ab03 ( ab02
@@ -344,7 +343,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
// xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
// xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
mov(%9, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
movddup(mem(rax), xmm15) // load alpha and duplicate
movaps(mem(rbx, 0*16), xmm8)
@@ -383,11 +382,11 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
label(.TRSM)
mov(%3, rax) // load address of a11
mov(%6, rcx) // load address of c11
mov(var(a11), rax) // load address of a11
mov(var(c11), rcx) // load address of c11
mov(%7, rsi) // load rs_c
mov(%8, rdi) // load cs_c
mov(var(rs_c), rsi) // load rs_c
mov(var(cs_c), rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
@@ -504,20 +503,20 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a12), // 2
"m" (a11), // 3
"m" (b21), // 4
"m" (b11), // 5
"m" (c11), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (alpha), // 9
"m" (b_next) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a12] "m" (a12), // 2
[a11] "m" (a11), // 3
[b21] "m" (b21), // 4
[b11] "m" (b11), // 5
[c11] "m" (c11), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[alpha] "m" (alpha), // 9
[b_next] "m" (b_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -525,7 +524,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}


@@ -64,10 +64,9 @@ void bli_dtrsm_l_penryn_asm_4x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%1, rbx) // load address of b11.
mov(var(b11), rbx) // load address of b11.
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 )
@@ -80,11 +79,11 @@ void bli_dtrsm_l_penryn_asm_4x4
mov(%0, rax) // load address of a11
mov(%2, rcx) // load address of c11
mov(var(a11), rax) // load address of a11
mov(var(c11), rcx) // load address of c11
mov(%3, rsi) // load rs_c
mov(%4, rdi) // load cs_c
mov(var(rs_c), rsi) // load rs_c
mov(var(cs_c), rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
@@ -199,13 +198,14 @@ void bli_dtrsm_l_penryn_asm_4x4
end_asm(
: // output operands (none)
: // input operands
"m" (a11), // 0
"m" (b11), // 1
"m" (c11), // 2
"m" (rs_c), // 3
"m" (cs_c) // 4
[a11] "m" (a11), // 0
[b11] "m" (b11), // 1
[c11] "m" (c11), // 2
[rs_c] "m" (rs_c), // 3
[cs_c] "m" (cs_c) // 4
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -213,7 +213,7 @@ void bli_dtrsm_l_penryn_asm_4x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}


@@ -64,10 +64,9 @@ void bli_dtrsm_u_penryn_asm_4x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%1, rbx) // load address of b11.
mov(var(b11), rbx) // load address of b11.
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 )
@@ -80,11 +79,11 @@ void bli_dtrsm_u_penryn_asm_4x4
mov(%0, rax) // load address of a11
mov(%2, rcx) // load address of c11
mov(var(a11), rax) // load address of a11
mov(var(c11), rcx) // load address of c11
mov(%3, rsi) // load rs_c
mov(%4, rdi) // load cs_c
mov(var(rs_c), rsi) // load rs_c
mov(var(cs_c), rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
@@ -202,13 +201,14 @@ void bli_dtrsm_u_penryn_asm_4x4
end_asm(
: // output operands (none)
: // input operands
"m" (a11), // 0
"m" (b11), // 1
"m" (c11), // 2
"m" (rs_c), // 3
"m" (cs_c) // 4
[a11] "m" (a11), // 0
[b11] "m" (b11), // 1
[c11] "m" (c11), // 2
[rs_c] "m" (rs_c), // 3
[cs_c] "m" (cs_c) // 4
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"xmm0", "xmm1", "xmm2", "xmm3",
@@ -216,7 +216,7 @@ void bli_dtrsm_u_penryn_asm_4x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}


@@ -62,14 +62,13 @@ void bli_sgemm_piledriver_asm_16x3
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r15) // load address of b_next.
mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r15) // load address of b_next.
mov(var(a_next), r14) // load address of a_next.
prefetch(0, mem(rbx, 128)) // prefetch b
prefetch(0, mem(rbx, 64+128)) // prefetch b
@@ -78,8 +77,8 @@ void bli_sgemm_piledriver_asm_16x3
add(imm(32*4), rax)
add(imm(12*4), rbx)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c;
@@ -103,7 +102,7 @@ void bli_sgemm_piledriver_asm_16x3
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -313,7 +312,7 @@ void bli_sgemm_piledriver_asm_16x3
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -393,8 +392,8 @@ void bli_sgemm_piledriver_asm_16x3
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rax), xmm0) // load alpha and duplicate
vbroadcastss(mem(rbx), xmm2) // load beta and duplicate
@@ -419,7 +418,7 @@ void bli_sgemm_piledriver_asm_16x3
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -887,19 +886,20 @@ void bli_sgemm_piledriver_asm_16x3
label(.SDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next), // 9
"m" (a_next) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -908,7 +908,7 @@ void bli_sgemm_piledriver_asm_16x3
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
void bli_dgemm_piledriver_asm_8x3
@@ -933,14 +933,13 @@ void bli_dgemm_piledriver_asm_8x3
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r15) // load address of b_next.
mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r15) // load address of b_next.
mov(var(a_next), r14) // load address of a_next.
prefetch(0, mem(rbx, 128)) // prefetch b
prefetch(0, mem(rbx, 64+128)) // prefetch b
@@ -949,8 +948,8 @@ void bli_dgemm_piledriver_asm_8x3
add(imm(16*8), rax)
add(imm(12*8), rbx)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c;
@@ -974,7 +973,7 @@ void bli_dgemm_piledriver_asm_8x3
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1183,7 +1182,7 @@ void bli_dgemm_piledriver_asm_8x3
label(.DCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done.
// else, we prepare to
@@ -1253,8 +1252,8 @@ void bli_dgemm_piledriver_asm_8x3
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vmovddup(mem(rax), xmm0) // load alpha and duplicate
vmovddup(mem(rbx), xmm2) // load beta and duplicate
@@ -1278,7 +1277,7 @@ void bli_dgemm_piledriver_asm_8x3
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -1606,19 +1605,20 @@ void bli_dgemm_piledriver_asm_8x3
label(.DDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next), // 9
"m" (a_next) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1627,7 +1627,7 @@ void bli_dgemm_piledriver_asm_8x3
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
void bli_cgemm_piledriver_asm_4x2
@@ -1652,17 +1652,16 @@ void bli_cgemm_piledriver_asm_4x2
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r15) // load address of b_next.
mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r15) // load address of b_next.
mov(var(a_next), r14) // load address of a_next.
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
@@ -1682,7 +1681,7 @@ void bli_cgemm_piledriver_asm_4x2
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1848,7 +1847,7 @@ void bli_cgemm_piledriver_asm_4x2
label(.CCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1925,7 +1924,7 @@ void bli_cgemm_piledriver_asm_4x2
// scale by alpha
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate
vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate
@@ -1952,7 +1951,7 @@ void bli_cgemm_piledriver_asm_4x2
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rbx), xmm6) // load beta_r and duplicate
vbroadcastss(mem(rbx, 4), xmm7) // load beta_i and duplicate
@@ -1962,7 +1961,7 @@ void bli_cgemm_piledriver_asm_4x2
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
@@ -2154,19 +2153,20 @@ void bli_cgemm_piledriver_asm_4x2
label(.CDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next), // 9
"m" (a_next) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -2175,7 +2175,7 @@ void bli_cgemm_piledriver_asm_4x2
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
void bli_zgemm_piledriver_asm_2x2
@@ -2200,17 +2200,16 @@ void bli_zgemm_piledriver_asm_2x2
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r15) // load address of b_next.
mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r15) // load address of b_next.
mov(var(a_next), r14) // load address of a_next.
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
lea(mem(, rdi, 2), rdi)
lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
@@ -2229,7 +2228,7 @@ void bli_zgemm_piledriver_asm_2x2
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -2399,7 +2398,7 @@ void bli_zgemm_piledriver_asm_2x2
label(.ZCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -2473,7 +2472,7 @@ void bli_zgemm_piledriver_asm_2x2
// scale by alpha
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vmovddup(mem(rax), xmm0) // load alpha_r and duplicate
vmovddup(mem(rax, 8), xmm1) // load alpha_i and duplicate
@@ -2500,7 +2499,7 @@ void bli_zgemm_piledriver_asm_2x2
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vmovddup(mem(rbx), xmm6) // load beta_r and duplicate
vmovddup(mem(rbx, 8), xmm7) // load beta_i and duplicate
@@ -2510,7 +2509,7 @@ void bli_zgemm_piledriver_asm_2x2
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
lea(mem(, rsi, 2), rsi)
//lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
@@ -2688,19 +2687,20 @@ void bli_zgemm_piledriver_asm_2x2
label(.ZDONE)
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next), // 9
"m" (a_next) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -2709,7 +2709,7 @@ void bli_zgemm_piledriver_asm_2x2
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}


@@ -62,20 +62,19 @@ void bli_sgemm_sandybridge_asm_8x8
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(var(b_next), r15) // load address of b_next.
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
vpermilps(imm(0x4e), ymm2, ymm3)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;
@@ -100,7 +99,7 @@ void bli_sgemm_sandybridge_asm_8x8
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -248,7 +247,7 @@ void bli_sgemm_sandybridge_asm_8x8
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -393,8 +392,8 @@ void bli_sgemm_sandybridge_asm_8x8
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
vbroadcastss(mem(rbx), ymm4) // load beta and duplicate
@@ -412,7 +411,7 @@ void bli_sgemm_sandybridge_asm_8x8
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -1002,19 +1001,20 @@ void bli_sgemm_sandybridge_asm_8x8
vzeroupper()
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1023,7 +1023,7 @@ void bli_sgemm_sandybridge_asm_8x8
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
void bli_dgemm_sandybridge_asm_8x4
@@ -1048,22 +1048,21 @@ void bli_dgemm_sandybridge_asm_8x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r15) // load address of b_next.
//mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r15) // load address of b_next.
//mov(var(a_next), r14) // load address of a_next.
sub(imm(4*64), r15)
vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
vmovapd(mem(rbx, 0*32), ymm2) // elements of a and b.
vpermilpd(imm(0x5), ymm2, ymm3)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
@@ -1083,7 +1082,7 @@ void bli_dgemm_sandybridge_asm_8x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1228,7 +1227,7 @@ void bli_dgemm_sandybridge_asm_8x4
label(.DCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1343,8 +1342,8 @@ void bli_dgemm_sandybridge_asm_8x4
// ab70 ) ab71 ) ab72 ) ab73 )
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm2) // load beta and duplicate
@@ -1362,7 +1361,7 @@ void bli_dgemm_sandybridge_asm_8x4
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -1677,19 +1676,20 @@ void bli_dgemm_sandybridge_asm_8x4
vzeroupper()
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next)/*, // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next)/*, // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1698,7 +1698,7 @@ void bli_dgemm_sandybridge_asm_8x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
void bli_cgemm_sandybridge_asm_8x4
@@ -1723,22 +1723,21 @@ void bli_cgemm_sandybridge_asm_8x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(%9, r15) // load address of b_next.
//mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
mov(var(b_next), r15) // load address of b_next.
//mov(var(a_next), r14) // load address of a_next.
sub(imm(4*64), r15)
vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
vmovsldup(mem(rbx, 0*32), ymm2)
vpermilps(imm(0x4e), ymm2, ymm3)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
@@ -1758,7 +1757,7 @@ void bli_cgemm_sandybridge_asm_8x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -2004,7 +2003,7 @@ void bli_cgemm_sandybridge_asm_8x4
label(.CCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -2175,7 +2174,7 @@ void bli_cgemm_sandybridge_asm_8x4
// scale by alpha
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate
vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate
@@ -2222,7 +2221,7 @@ void bli_cgemm_sandybridge_asm_8x4
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate
@@ -2232,7 +2231,7 @@ void bli_cgemm_sandybridge_asm_8x4
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -2638,19 +2637,20 @@ void bli_cgemm_sandybridge_asm_8x4
vzeroupper()
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c), // 8
"m" (b_next)/*, // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c), // 8
[b_next] "m" (b_next)/*, // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -2659,7 +2659,7 @@ void bli_cgemm_sandybridge_asm_8x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
@@ -2686,21 +2686,20 @@ void bli_zgemm_sandybridge_asm_4x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
//mov(%10, r14) // load address of a_next.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(var(b_next), r15) // load address of b_next.
//mov(var(a_next), r14) // load address of a_next.
vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
vmovddup(mem(rbx, 0+0*32), ymm2)
vmovddup(mem(rbx, 0+1*32), ymm3)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
lea(mem(, rdi, 2), rdi)
lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
@@ -2721,7 +2720,7 @@ void bli_zgemm_sandybridge_asm_4x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -2964,7 +2963,7 @@ void bli_zgemm_sandybridge_asm_4x4
label(.ZCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -3083,7 +3082,7 @@ void bli_zgemm_sandybridge_asm_4x4
// scale by alpha
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate
vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate
@@ -3130,7 +3129,7 @@ void bli_zgemm_sandybridge_asm_4x4
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate
vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate
@@ -3140,7 +3139,7 @@ void bli_zgemm_sandybridge_asm_4x4
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
lea(mem(, rsi, 2), rsi)
lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
@@ -3488,19 +3487,20 @@ void bli_zgemm_sandybridge_asm_4x4
vzeroupper()
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -3509,7 +3509,7 @@ void bli_zgemm_sandybridge_asm_4x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}


@@ -306,7 +306,7 @@ void bli_dgemm_skx_asm_16x12_l2(
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
BEGIN_ASM
BEGIN_ASM()
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8))


@@ -336,7 +336,7 @@ void bli_sgemm_skx_asm_32x12_l2(
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
BEGIN_ASM
BEGIN_ASM()
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8))


@@ -99,14 +99,13 @@ void bli_sgemm_zen_asm_6x16
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rbx)
@@ -114,8 +113,8 @@ void bli_sgemm_zen_asm_6x16
vmovaps(mem(rbx, -4*32), ymm0)
vmovaps(mem(rbx, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%7, rdi) // load rs_c
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
@@ -130,7 +129,7 @@ void bli_sgemm_zen_asm_6x16
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -257,7 +256,7 @@ void bli_sgemm_zen_asm_6x16
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -305,8 +304,8 @@ void bli_sgemm_zen_asm_6x16
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3) // load beta and duplicate
@@ -328,7 +327,7 @@ void bli_sgemm_zen_asm_6x16
mov(%8, rsi) // load cs_c
mov(var(cs_c), rsi) // load cs_c
lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)
lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*cs_c;
@@ -872,19 +871,20 @@ void bli_sgemm_zen_asm_6x16
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -893,7 +893,7 @@ void bli_sgemm_zen_asm_6x16
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
@@ -945,14 +945,13 @@ void bli_dgemm_zen_asm_6x8
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rbx)
@@ -960,8 +959,8 @@ void bli_dgemm_zen_asm_6x8
vmovapd(mem(rbx, -4*32), ymm0)
vmovapd(mem(rbx, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%7, rdi) // load rs_c
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c;
@@ -976,7 +975,7 @@ void bli_dgemm_zen_asm_6x8
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1103,7 +1102,7 @@ void bli_dgemm_zen_asm_6x8
label(.DCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1151,8 +1150,8 @@ void bli_dgemm_zen_asm_6x8
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
@@ -1174,7 +1173,7 @@ void bli_dgemm_zen_asm_6x8
mov(%8, rsi) // load cs_c
mov(var(cs_c), rsi) // load cs_c
lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
@@ -1611,19 +1610,20 @@ void bli_dgemm_zen_asm_6x8
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1632,7 +1632,7 @@ void bli_dgemm_zen_asm_6x8
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
@@ -1691,14 +1691,13 @@ void bli_cgemm_zen_asm_3x8
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rbx)
@@ -1706,8 +1705,8 @@ void bli_cgemm_zen_asm_3x8
vmovaps(mem(rbx, -4*32), ymm0)
vmovaps(mem(rbx, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%7, rdi) // load rs_c
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(scomplex)
lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c;
@@ -1720,7 +1719,7 @@ void bli_cgemm_zen_asm_3x8
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1847,7 +1846,7 @@ void bli_cgemm_zen_asm_3x8
label(.CCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1916,7 +1915,7 @@ void bli_cgemm_zen_asm_3x8
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate
vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate
@@ -1957,14 +1956,14 @@ void bli_cgemm_zen_asm_3x8
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
mov(%8, rsi) // load cs_c
mov(var(cs_c), rsi) // load cs_c
lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(scomplex)
lea(mem(, rsi, 4), rdx) // rdx = 4*cs_c;
lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c;
@@ -2143,19 +2142,20 @@ void bli_cgemm_zen_asm_3x8
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -2164,7 +2164,7 @@ void bli_cgemm_zen_asm_3x8
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
@@ -2219,14 +2219,13 @@ void bli_zgemm_zen_asm_3x4
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rbx)
@@ -2234,8 +2233,8 @@ void bli_zgemm_zen_asm_3x4
vmovapd(mem(rbx, -4*32), ymm0)
vmovapd(mem(rbx, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%7, rdi) // load rs_c
mov(var(c), rcx) // load address of c
mov(var(rs_c), rdi) // load rs_c
lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dcomplex)
lea(mem(, rdi, 2), rdi)
@@ -2249,7 +2248,7 @@ void bli_zgemm_zen_asm_3x4
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -2376,7 +2375,7 @@ void bli_zgemm_zen_asm_3x4
label(.ZCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -2444,7 +2443,7 @@ void bli_zgemm_zen_asm_3x4
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
@@ -2485,14 +2484,14 @@ void bli_zgemm_zen_asm_3x4
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
mov(%8, rsi) // load cs_c
mov(var(cs_c), rsi) // load cs_c
lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dcomplex)
lea(mem(, rsi, 2), rsi)
lea(mem(, rsi, 2), rdx) // rdx = 2*cs_c;
@@ -2671,19 +2670,20 @@ void bli_zgemm_zen_asm_3x4
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -2692,7 +2692,7 @@ void bli_zgemm_zen_asm_3x4
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
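
The named operands introduced above ([k_iter], [a], [rs_c], and so on) are standard GCC/Clang extended-asm syntax; var() is presumably what lets the instruction macros reference them by name instead of by position. A minimal standalone sketch of that operand style, independent of the BLIS macros (the function and operand names here are illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: shows the %[name] / [name] "m" (name) binding that the
   kernels above now rely on, so reordering the operand list cannot silently
   shift positional %N references. */
static uint64_t scale_by_stride( uint64_t k, uint64_t rs_c )
{
    uint64_t out;
    __asm__ volatile
    (
        "movq  %[k],    %%rax\n\t"
        "imulq %[rs_c], %%rax\n\t"
        "movq  %%rax,   %[out]\n\t"
        : [out]  "=m" (out)          // named output operand
        : [k]    "m"  (k),           // named input operands
          [rs_c] "m"  (rs_c)
        : "rax", "memory"
    );
    return out;                      // k * rs_c
}

int main( void )
{
    printf( "%llu\n", (unsigned long long)scale_by_stride( 6, 8 ) );  // prints 48
    return 0;
}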

View File

@@ -98,14 +98,13 @@ void bli_sgemm_zen_asm_16x6
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rax)
@@ -113,8 +112,8 @@ void bli_sgemm_zen_asm_16x6
vmovaps(mem(rax, -4*32), ymm0)
vmovaps(mem(rax, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c;
@@ -129,7 +128,7 @@ void bli_sgemm_zen_asm_16x6
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -256,7 +255,7 @@ void bli_sgemm_zen_asm_16x6
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -304,8 +303,8 @@ void bli_sgemm_zen_asm_16x6
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3) // load beta and duplicate
@@ -327,7 +326,7 @@ void bli_sgemm_zen_asm_16x6
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*rs_c;
@@ -614,19 +613,20 @@ void bli_sgemm_zen_asm_16x6
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -635,7 +635,7 @@ void bli_sgemm_zen_asm_16x6
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
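
The k_iter and k_left operands bound above come from splitting the k dimension into an unrolled main loop and a scalar remainder loop (the .SCONSIDKLEFT branch). A hedged C sketch of that structure; the unroll factor of 4 is an assumption, since the division happens in the surrounding C code that is not shown in this diff:

#include <stdint.h>

/* Hedged sketch (not the kernel's code): how a k loop is typically split into
   the k_iter / k_left pair passed into the assembly above. */
#define UNROLL 4   /* assumed unroll factor */

static double dot_split( const double* a, const double* b, uint64_t k )
{
    uint64_t k_iter = k / UNROLL;   // full unrolled iterations (main loop)
    uint64_t k_left = k % UNROLL;   // remainder handled after the KLEFT branch
    double   acc    = 0.0;

    for ( uint64_t i = 0; i < k_iter; ++i )
    {
        acc += a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
        a += UNROLL; b += UNROLL;
    }
    for ( uint64_t i = 0; i < k_left; ++i )
    {
        acc += a[0]*b[0];
        a += 1; b += 1;
    }
    return acc;
}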
#define DGEMM_INPUT_GS_BETA_NZ \
@@ -684,14 +684,13 @@ void bli_dgemm_zen_asm_8x6
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rax)
@@ -699,8 +698,8 @@ void bli_dgemm_zen_asm_8x6
vmovapd(mem(rax, -4*32), ymm0)
vmovapd(mem(rax, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c;
@@ -715,7 +714,7 @@ void bli_dgemm_zen_asm_8x6
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -842,7 +841,7 @@ void bli_dgemm_zen_asm_8x6
label(.DCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -890,8 +889,8 @@ void bli_dgemm_zen_asm_8x6
mov(%4, rax) // load address of alpha
mov(%5, rbx) // load address of beta
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate
@@ -913,7 +912,7 @@ void bli_dgemm_zen_asm_8x6
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double)
lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;
@@ -1199,19 +1198,20 @@ void bli_dgemm_zen_asm_8x6
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1220,7 +1220,7 @@ void bli_dgemm_zen_asm_8x6
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
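
The lea(mem(, reg, 8), reg) idiom that follows each stride load converts an element stride into a byte stride (e.g. cs_c * sizeof(double)) in a single instruction without disturbing flags. A small standalone sketch of the same scaled-lea trick (the function name is illustrative only):

#include <stdint.h>

/* Illustrative only: the scaled-index form of lea multiplies by 2, 4, or 8,
   which is how the kernels above turn an element stride into a byte stride. */
static uint64_t elems_to_bytes_f64( uint64_t rs_c )
{
    uint64_t bytes;
    __asm__ volatile
    (
        "leaq (,%[rs],8), %%rdi\n\t"     // rdi = rs_c * sizeof(double)
        "movq %%rdi, %[out]\n\t"
        : [out] "=m" (bytes)
        : [rs]  "r"  (rs_c)
        : "rdi", "memory"
    );
    return bytes;
}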
@@ -1279,14 +1279,13 @@ void bli_cgemm_zen_asm_8x3
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rax)
@@ -1294,8 +1293,8 @@ void bli_cgemm_zen_asm_8x3
vmovaps(mem(rax, -4*32), ymm0)
vmovaps(mem(rax, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c;
@@ -1308,7 +1307,7 @@ void bli_cgemm_zen_asm_8x3
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1435,7 +1434,7 @@ void bli_cgemm_zen_asm_8x3
label(.CCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1504,7 +1503,7 @@ void bli_cgemm_zen_asm_8x3
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate
vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate
@@ -1545,14 +1544,14 @@ void bli_cgemm_zen_asm_8x3
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)
lea(mem(, rsi, 4), rdx) // rdx = 4*rs_c;
lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c;
@@ -1731,19 +1730,20 @@ void bli_cgemm_zen_asm_8x3
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1752,7 +1752,7 @@ void bli_cgemm_zen_asm_8x3
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
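
In the cgemm kernel above, alpha_r and alpha_i are broadcast from adjacent floats (mem(rax) and mem(rax, 4)) and applied to interleaved real/imaginary data. A hedged intrinsics sketch of that split-broadcast idea for one ymm register of packed scomplex values; this is not the kernel's exact instruction sequence:

#include <immintrin.h>

/* Sketch only: scales four interleaved single-precision complex numbers
   (r0,i0,r1,i1,...) by alpha = alpha[0] + alpha[1]*i using separate
   broadcasts of the real and imaginary parts. Requires AVX. */
static __m256 scale_scomplex4( __m256 x, const float* alpha )
{
    __m256 ar = _mm256_broadcast_ss( alpha );      // alpha_r in every lane
    __m256 ai = _mm256_broadcast_ss( alpha + 1 );  // alpha_i in every lane
    __m256 xs = _mm256_permute_ps( x, 0xB1 );      // swap each (re, im) pair
    // even lanes: r*ar - i*ai   odd lanes: i*ar + r*ai
    return _mm256_addsub_ps( _mm256_mul_ps( x, ar ),
                             _mm256_mul_ps( xs, ai ) );
}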
@@ -1807,14 +1807,13 @@ void bli_zgemm_zen_asm_4x3
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a), rax) // load address of a.
mov(var(b), rbx) // load address of b.
//mov(%9, r15) // load address of b_next.
add(imm(32*4), rax)
@@ -1822,8 +1821,8 @@ void bli_zgemm_zen_asm_4x3
vmovapd(mem(rax, -4*32), ymm0)
vmovapd(mem(rax, -3*32), ymm1)
mov(%6, rcx) // load address of c
mov(%8, rdi) // load cs_c
mov(var(c), rcx) // load address of c
mov(var(cs_c), rdi) // load cs_c
lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
lea(mem(, rdi, 2), rdi)
@@ -1837,7 +1836,7 @@ void bli_zgemm_zen_asm_4x3
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.ZCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -1964,7 +1963,7 @@ void bli_zgemm_zen_asm_4x3
label(.ZCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -2032,7 +2031,7 @@ void bli_zgemm_zen_asm_4x3
mov(%4, rax) // load address of alpha
mov(var(alpha), rax) // load address of alpha
vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
@@ -2073,14 +2072,14 @@ void bli_zgemm_zen_asm_4x3
mov(%5, rbx) // load address of beta
mov(var(beta), rbx) // load address of beta
vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
mov(%7, rsi) // load rs_c
mov(var(rs_c), rsi) // load rs_c
lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex)
lea(mem(, rsi, 2), rsi)
lea(mem(, rsi, 2), rdx) // rdx = 2*rs_c;
@@ -2259,19 +2258,20 @@ void bli_zgemm_zen_asm_4x3
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a] "m" (a), // 2
[b] "m" (b), // 3
[alpha] "m" (alpha), // 4
[beta] "m" (beta), // 5
[c] "m" (c), // 6
[rs_c] "m" (rs_c), // 7
[cs_c] "m" (cs_c)/*, // 8
[b_next] "m" (b_next), // 9
[a_next] "m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -2280,7 +2280,7 @@ void bli_zgemm_zen_asm_4x3
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}

View File

@@ -81,36 +81,35 @@ void bli_sgemmtrsm_l_zen_asm_6x16
float* beta = bli_sm1;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a10), rax) // load address of a.
mov(var(b01), rbx) // load address of b.
add(imm(32*4), rbx)
// initialize loop by pre-loading
vmovaps(mem(rbx, -4*32), ymm0)
vmovaps(mem(rbx, -3*32), ymm1)
mov(%7, rcx) // load address of b11
mov(var(b11), rcx) // load address of b11
mov(imm(16), rdi) // set rs_b = PACKNR = 16
lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float)
// NOTE: c11, rs_c, and cs_c aren't
// needed for a while, but we load
// them now to avoid stalling later.
mov(%8, r8) // load address of c11
mov(%9, r9) // load rs_c
mov(var(c11), r8) // load address of c11
mov(var(rs_c), r9) // load rs_c
lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float)
mov(%10, r10) // load cs_c
mov(var(cs_c), r10) // load cs_c
lea(mem(, r10, 4), r10) // cs_c *= sizeof(float)
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -237,7 +236,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -286,7 +285,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
mov(%5, rbx) // load address of alpha
mov(var(alpha), rbx) // load address of alpha
vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate
@@ -365,7 +364,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
// ymm14 ymm15 = ( beta50..57 ) ( beta58..5F )
mov(%6, rax) // load address of a11
mov(var(a11), rax) // load address of a11
mov(r11, rcx) // recall address of b11
mov(r14, rdx) // recall address of b11+8*cs_b
@@ -772,19 +771,20 @@ void bli_sgemmtrsm_l_zen_asm_6x16
vzeroupper()
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a10), // 2
"m" (b01), // 3
"m" (beta), // 4
"m" (alpha), // 5
"m" (a11), // 6
"m" (b11), // 7
"m" (c11), // 8
"m" (rs_c), // 9
"m" (cs_c) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a10] "m" (a10), // 2
[b01] "m" (b01), // 3
[beta] "m" (beta), // 4
[alpha] "m" (alpha), // 5
[a11] "m" (a11), // 6
[b11] "m" (b11), // 7
[c11] "m" (c11), // 8
[rs_c] "m" (rs_c), // 9
[cs_c] "m" (cs_c) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -793,7 +793,7 @@ void bli_sgemmtrsm_l_zen_asm_6x16
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
@@ -835,36 +835,35 @@ void bli_dgemmtrsm_l_zen_asm_6x8
double* beta = bli_dm1;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a10), rax) // load address of a.
mov(var(b01), rbx) // load address of b.
add(imm(32*4), rbx)
// initialize loop by pre-loading
vmovapd(mem(rbx, -4*32), ymm0)
vmovapd(mem(rbx, -3*32), ymm1)
mov(%7, rcx) // load address of b11
mov(var(b11), rcx) // load address of b11
mov(imm(8), rdi) // set rs_b = PACKNR = 8
lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double)
// NOTE: c11, rs_c, and cs_c aren't
// needed for a while, but we load
// them now to avoid stalling later.
mov(%8, r8) // load address of c11
mov(%9, r9) // load rs_c
mov(var(c11), r8) // load address of c11
mov(var(rs_c), r9) // load rs_c
lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double)
mov(%10, r10) // load cs_c
mov(var(cs_c), r10) // load cs_c
lea(mem(, r10, 8), r10) // cs_c *= sizeof(double)
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -991,7 +990,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
label(.DCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1041,7 +1040,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
mov(%5, rbx) // load address of alpha
mov(var(alpha), rbx) // load address of alpha
vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate
@@ -1120,7 +1119,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
// ymm14 ymm15 = ( beta50..53 ) ( beta54..57 )
mov(%6, rax) // load address of a11
mov(var(a11), rax) // load address of a11
mov(r11, rcx) // recall address of b11
mov(r14, rdx) // recall address of b11+4*cs_b
@@ -1488,19 +1487,20 @@ void bli_dgemmtrsm_l_zen_asm_6x8
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a10), // 2
"m" (b01), // 3
"m" (beta), // 4
"m" (alpha), // 5
"m" (a11), // 6
"m" (b11), // 7
"m" (c11), // 8
"m" (rs_c), // 9
"m" (cs_c) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a10] "m" (a10), // 2
[b01] "m" (b01), // 3
[beta] "m" (beta), // 4
[alpha] "m" (alpha), // 5
[a11] "m" (a11), // 6
[b11] "m" (b11), // 7
[c11] "m" (c11), // 8
[rs_c] "m" (rs_c), // 9
[cs_c] "m" (cs_c) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1509,7 +1509,7 @@ void bli_dgemmtrsm_l_zen_asm_6x8
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
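
For reference, the operation the gemmtrsm_l kernels fuse is b11 := inv(a11) * (alpha*b11 - a10*b01), with the result also streamed out to c11; the hard-coded rs_b = PACKNR above reflects the packed layout of b11. A naive C sketch under assumed conventions (column-major a10/a11 panels, row-major b panels, a11 diagonal stored pre-inverted); it is a readability aid, not the kernel's algorithm:

#include <stdint.h>

#define MR 6
#define NR 8

/* Hedged reference sketch of the fused gemmtrsm_l operation for the 6x8
   double-precision case. Packing layouts and the pre-inverted diagonal are
   assumptions, not taken from this diff. */
static void dgemmtrsm_l_ref( uint64_t k, double alpha,
                             const double* a10, const double* a11,
                             const double* b01, double* b11,
                             double* c11, uint64_t rs_c, uint64_t cs_c )
{
    // b11 := alpha*b11 - a10*b01
    for ( uint64_t i = 0; i < MR; ++i )
        for ( uint64_t j = 0; j < NR; ++j )
        {
            double ab = 0.0;
            for ( uint64_t p = 0; p < k; ++p )
                ab += a10[ i + p*MR ] * b01[ j + p*NR ];
            b11[ i*NR + j ] = alpha * b11[ i*NR + j ] - ab;
        }

    // b11 := inv(a11)*b11 via forward substitution; copy result to c11
    for ( uint64_t i = 0; i < MR; ++i )
        for ( uint64_t j = 0; j < NR; ++j )
        {
            double x = b11[ i*NR + j ];
            for ( uint64_t l = 0; l < i; ++l )
                x -= a11[ i + l*MR ] * b11[ l*NR + j ];
            x *= a11[ i + i*MR ];            // diagonal assumed pre-inverted
            b11[ i*NR + j ] = x;
            c11[ i*rs_c + j*cs_c ] = x;
        }
}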

View File

@@ -81,36 +81,35 @@ void bli_sgemmtrsm_u_zen_asm_6x16
float* beta = bli_sm1;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a10), rax) // load address of a.
mov(var(b01), rbx) // load address of b.
add(imm(32*4), rbx)
// initialize loop by pre-loading
vmovaps(mem(rbx, -4*32), ymm0)
vmovaps(mem(rbx, -3*32), ymm1)
mov(%7, rcx) // load address of b11
mov(var(b11), rcx) // load address of b11
mov(imm(16), rdi) // set rs_b = PACKNR = 16
lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float)
// NOTE: c11, rs_c, and cs_c aren't
// needed for a while, but we load
// them now to avoid stalling later.
mov(%8, r8) // load address of c11
mov(%9, r9) // load rs_c
mov(var(c11), r8) // load address of c11
mov(var(rs_c), r9) // load rs_c
lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float)
mov(%10, r10) // load cs_c
mov(var(cs_c), r10) // load cs_c
lea(mem(, r10, 4), r10) // cs_c *= sizeof(float)
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -237,7 +236,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
label(.SCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -286,7 +285,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
mov(%5, rbx) // load address of alpha
mov(var(alpha), rbx) // load address of alpha
vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate
@@ -365,7 +364,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
// ymm14 ymm15 = ( beta50..57 ) ( beta58..5F )
mov(%6, rax) // load address of a11
mov(var(a11), rax) // load address of a11
mov(r11, rcx) // recall address of b11
mov(r14, rdx) // recall address of b11+8*cs_b
@@ -776,20 +775,20 @@ void bli_sgemmtrsm_u_zen_asm_6x16
vzeroupper()
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a10), // 2
"m" (b01), // 3
"m" (beta), // 4
"m" (alpha), // 5
"m" (a11), // 6
"m" (b11), // 7
"m" (c11), // 8
"m" (rs_c), // 9
"m" (cs_c) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a10] "m" (a10), // 2
[b01] "m" (b01), // 3
[beta] "m" (beta), // 4
[alpha] "m" (alpha), // 5
[a11] "m" (a11), // 6
[b11] "m" (b11), // 7
[c11] "m" (c11), // 8
[rs_c] "m" (rs_c), // 9
[cs_c] "m" (cs_c) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -798,7 +797,7 @@ void bli_sgemmtrsm_u_zen_asm_6x16
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}
@@ -840,36 +839,35 @@ void bli_dgemmtrsm_u_zen_asm_6x8
double* beta = bli_dm1;
__asm__ volatile
(
begin_asm()
vzeroall() // zero all xmm/ymm registers.
mov(%2, rax) // load address of a.
mov(%3, rbx) // load address of b.
mov(var(a10), rax) // load address of a.
mov(var(b01), rbx) // load address of b.
add(imm(32*4), rbx)
// initialize loop by pre-loading
vmovapd(mem(rbx, -4*32), ymm0)
vmovapd(mem(rbx, -3*32), ymm1)
mov(%7, rcx) // load address of b11
mov(var(b11), rcx) // load address of b11
mov(imm(8), rdi) // set rs_b = PACKNR = 8
lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double)
// NOTE: c11, rs_c, and cs_c aren't
// needed for a while, but we load
// them now to avoid stalling later.
mov(%8, r8) // load address of c11
mov(%9, r9) // load rs_c
mov(var(c11), r8) // load address of c11
mov(var(rs_c), r9) // load rs_c
lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double)
mov(%10, r10) // load cs_c
mov(var(cs_c), r10) // load cs_c
lea(mem(, r10, 8), r10) // cs_c *= sizeof(double)
mov(%0, rsi) // i = k_iter;
mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
@@ -996,7 +994,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
label(.DCONSIDKLEFT)
mov(%1, rsi) // i = k_left;
mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
@@ -1046,7 +1044,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
mov(%5, rbx) // load address of alpha
mov(var(alpha), rbx) // load address of alpha
vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate
@@ -1125,7 +1123,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
// ymm14 ymm15 = ( beta50..53 ) ( beta54..57 )
mov(%6, rax) // load address of a11
mov(var(a11), rax) // load address of a11
mov(r11, rcx) // recall address of b11
mov(r14, rdx) // recall address of b11+4*cs_b
@@ -1497,20 +1495,20 @@ void bli_dgemmtrsm_u_zen_asm_6x8
vzeroupper()
end_asm(
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a10), // 2
"m" (b01), // 3
"m" (beta), // 4
"m" (alpha), // 5
"m" (a11), // 6
"m" (b11), // 7
"m" (c11), // 8
"m" (rs_c), // 9
"m" (cs_c) // 10
[k_iter] "m" (k_iter), // 0
[k_left] "m" (k_left), // 1
[a10] "m" (a10), // 2
[b01] "m" (b01), // 3
[beta] "m" (beta), // 4
[alpha] "m" (alpha), // 5
[a11] "m" (a11), // 6
[b11] "m" (b11), // 7
[c11] "m" (c11), // 8
[rs_c] "m" (rs_c), // 9
[cs_c] "m" (cs_c) // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
@@ -1519,7 +1517,7 @@ void bli_dgemmtrsm_u_zen_asm_6x8
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
)
}