Added single/real gemm micro-kernel for x86_64.

Details:
- Added a single-precision real gemm micro-kernel in
  kernels/x86_64/3/bli_gemm_opt_d4x4.c.
- Adjusted the single-precision real register blocksizes in
  config/clarksville/bli_kernel.h to be 8x4.
- Added a missing comment to bli_packm_blk_var2.c that was present in
  bli_packm_blk_var3.c.
This commit is contained in:
Field G. Van Zee
2013-08-27 13:41:46 -05:00
parent dedda523dc
commit d352c746e5
3 changed files with 698 additions and 27 deletions

View File

@@ -54,7 +54,7 @@
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_MC_S 768
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
@@ -82,8 +82,8 @@
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/2)
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/2)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/2)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/2)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
@@ -100,7 +100,7 @@
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_D 4

View File

@@ -260,6 +260,9 @@ void PASTEMAC(ch,varname )( \
{ \
diagoffc_i_abs = bli_abs( diagoffc_i ); \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel, but we could probably still support those cases if
they happen. */ \
if ( ( bli_is_col_stored( rs_p, cs_p ) && diagoffc_i < 0 ) || \
( bli_is_row_stored( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \

View File

@@ -45,7 +45,675 @@ void bli_sgemm_opt_d4x4(
float* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
dim_t k_iter;
dim_t k_left;
k_iter = k / 4;
k_left = k % 4;
__asm__ volatile
(
" \n\t"
" \n\t"
"movq %2, %%rax \n\t" // load address of a.
"movq %3, %%rbx \n\t" // load address of b.
"movq %9, %%r9 \n\t" // load address of b_next.
" \n\t"
"subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte
"subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
" \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b.
"movaps -8 * 16(%%rbx), %%xmm2 \n\t"
" \n\t"
"movq %6, %%rcx \n\t" // load address of c
"movq %8, %%rdi \n\t" // load cs_c
"leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float)
"leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c;
" \n\t"
"prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next
" \n\t"
"xorpd %%xmm3, %%xmm3 \n\t"
"xorpd %%xmm4, %%xmm4 \n\t"
"xorpd %%xmm5, %%xmm5 \n\t"
"xorpd %%xmm6, %%xmm6 \n\t"
" \n\t"
"prefetcht0 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c
"xorpd %%xmm8, %%xmm8 \n\t"
"movaps %%xmm8, %%xmm9 \n\t"
"prefetcht0 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
"movaps %%xmm8, %%xmm10 \n\t"
"movaps %%xmm8, %%xmm11 \n\t"
"prefetcht0 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c
"movaps %%xmm8, %%xmm12 \n\t"
"movaps %%xmm8, %%xmm13 \n\t"
"prefetcht0 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
"movaps %%xmm8, %%xmm14 \n\t"
"movaps %%xmm8, %%xmm15 \n\t"
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rsi \n\t" // i = k_iter;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that
" \n\t" // contains the k_left loop.
" \n\t"
" \n\t"
".SLOOPKITER: \n\t" // MAIN LOOP
" \n\t"
"prefetcht0 (4*35+1) * 8(%%rax) \n\t"
" \n\t"
"addps %%xmm6, %%xmm10 \n\t" // iteration 0
"addps %%xmm3, %%xmm14 \n\t"
"movaps %%xmm2, %%xmm3 \n\t"
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
"mulps %%xmm0, %%xmm2 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm4, %%xmm11 \n\t"
"addps %%xmm5, %%xmm15 \n\t"
"movaps %%xmm7, %%xmm5 \n\t"
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
"mulps %%xmm0, %%xmm7 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
" \n\t"
"addps %%xmm2, %%xmm8 \n\t"
"movaps -7 * 16(%%rbx), %%xmm2 \n\t"
"addps %%xmm3, %%xmm12 \n\t"
"movaps %%xmm6, %%xmm3 \n\t"
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
"mulps %%xmm0, %%xmm6 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm7, %%xmm9 \n\t"
"addps %%xmm5, %%xmm13 \n\t"
"movaps %%xmm4, %%xmm5 \n\t"
"mulps %%xmm0, %%xmm4 \n\t"
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addps %%xmm6, %%xmm10 \n\t" // iteration 1
"addps %%xmm3, %%xmm14 \n\t"
"movaps %%xmm2, %%xmm3 \n\t"
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
"mulps %%xmm0, %%xmm2 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm4, %%xmm11 \n\t"
"addps %%xmm5, %%xmm15 \n\t"
"movaps %%xmm7, %%xmm5 \n\t"
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
"mulps %%xmm0, %%xmm7 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
" \n\t"
"addps %%xmm2, %%xmm8 \n\t"
"movaps -6 * 16(%%rbx), %%xmm2 \n\t"
"addps %%xmm3, %%xmm12 \n\t"
"movaps %%xmm6, %%xmm3 \n\t"
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
"mulps %%xmm0, %%xmm6 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm7, %%xmm9 \n\t"
"addps %%xmm5, %%xmm13 \n\t"
"movaps %%xmm4, %%xmm5 \n\t"
"mulps %%xmm0, %%xmm4 \n\t"
"movaps -4 * 16(%%rax), %%xmm0 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
"movaps -3 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addps %%xmm6, %%xmm10 \n\t" // iteration 2
"addps %%xmm3, %%xmm14 \n\t"
"movaps %%xmm2, %%xmm3 \n\t"
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
"mulps %%xmm0, %%xmm2 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm4, %%xmm11 \n\t"
"addps %%xmm5, %%xmm15 \n\t"
"movaps %%xmm7, %%xmm5 \n\t"
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
"mulps %%xmm0, %%xmm7 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
" \n\t"
"addps %%xmm2, %%xmm8 \n\t"
"movaps -5 * 16(%%rbx), %%xmm2 \n\t"
"addps %%xmm3, %%xmm12 \n\t"
"movaps %%xmm6, %%xmm3 \n\t"
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
"mulps %%xmm0, %%xmm6 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm7, %%xmm9 \n\t"
"addps %%xmm5, %%xmm13 \n\t"
"movaps %%xmm4, %%xmm5 \n\t"
"mulps %%xmm0, %%xmm4 \n\t"
"movaps -2 * 16(%%rax), %%xmm0 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
"movaps -1 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addps %%xmm6, %%xmm10 \n\t" // iteration 3
"addps %%xmm3, %%xmm14 \n\t"
"movaps %%xmm2, %%xmm3 \n\t"
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
"mulps %%xmm0, %%xmm2 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"subq $-4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr)
" \n\t"
"addps %%xmm4, %%xmm11 \n\t"
"addps %%xmm5, %%xmm15 \n\t"
"movaps %%xmm7, %%xmm5 \n\t"
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
"mulps %%xmm0, %%xmm7 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
" \n\t"
"subq $-4 * 4 * 4, %%r9 \n\t" // b_next += 4*4 (unroll x nr)
" \n\t"
"addps %%xmm2, %%xmm8 \n\t"
"movaps -4 * 16(%%rbx), %%xmm2 \n\t"
"addps %%xmm3, %%xmm12 \n\t"
"movaps %%xmm6, %%xmm3 \n\t"
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
"mulps %%xmm0, %%xmm6 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"subq $-4 * 4 * 4, %%rbx \n\t" // b += 4*4 (unroll x nr)
" \n\t"
"addps %%xmm7, %%xmm9 \n\t"
"addps %%xmm5, %%xmm13 \n\t"
"movaps %%xmm4, %%xmm5 \n\t"
"mulps %%xmm0, %%xmm4 \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
"movaps -7 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
"prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next[0]
"prefetcht2 16 * 4(%%r9) \n\t" // prefetch b_next[16]
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .SLOOPKITER \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".SCONSIDKLEFT: \n\t"
" \n\t"
"movq %1, %%rsi \n\t" // i = k_left;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter k_left loop.
" \n\t"
" \n\t"
".SLOOPKLEFT: \n\t" // EDGE LOOP
" \n\t"
"addps %%xmm6, %%xmm10 \n\t" // iteration 0
"addps %%xmm3, %%xmm14 \n\t"
"movaps %%xmm2, %%xmm3 \n\t"
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
"mulps %%xmm0, %%xmm2 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm4, %%xmm11 \n\t"
"addps %%xmm5, %%xmm15 \n\t"
"movaps %%xmm7, %%xmm5 \n\t"
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
"mulps %%xmm0, %%xmm7 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
" \n\t"
"addps %%xmm2, %%xmm8 \n\t"
"movaps -7 * 16(%%rbx), %%xmm2 \n\t"
"addps %%xmm3, %%xmm12 \n\t"
"movaps %%xmm6, %%xmm3 \n\t"
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
"mulps %%xmm0, %%xmm6 \n\t"
"mulps %%xmm1, %%xmm3 \n\t"
" \n\t"
"addps %%xmm7, %%xmm9 \n\t"
"addps %%xmm5, %%xmm13 \n\t"
"movaps %%xmm4, %%xmm5 \n\t"
"mulps %%xmm0, %%xmm4 \n\t"
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
"mulps %%xmm1, %%xmm5 \n\t"
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
"subq $-1 * 8 * 4, %%rax \n\t" // a += 8 (1 x mr)
"subq $-1 * 4 * 4, %%rbx \n\t" // b += 4 (1 x nr)
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .SLOOPKLEFT \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".SPOSTACCUM: \n\t"
" \n\t"
"addps %%xmm6, %%xmm10 \n\t"
"addps %%xmm3, %%xmm14 \n\t"
"addps %%xmm4, %%xmm11 \n\t"
"addps %%xmm5, %%xmm15 \n\t"
" \n\t"
" \n\t"
"movq %4, %%rax \n\t" // load address of alpha
"movq %5, %%rbx \n\t" // load address of beta
"movss (%%rax), %%xmm6 \n\t" // load alpha to bottom 4 bytes of xmm6
"movss (%%rbx), %%xmm7 \n\t" // load beta to bottom 4 bytes of xmm7
"pshufd $0x00, %%xmm6, %%xmm6 \n\t" // populate xmm6 with four alphas
"pshufd $0x00, %%xmm7, %%xmm7 \n\t" // populate xmm7 with four betas
" \n\t"
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c
"movq %%rsi, %%r8 \n\t" // make a copy of rs_c
" \n\t"
"leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float)
"leaq (%%rsi,%%rsi,2), %%r11 \n\t" // r11 = 3*(rs_c * sizeof(float))
" \n\t"
"leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
" \n\t" // ab11 ab12 ab13 ab10
" \n\t" // ab22 ab23 ab20 ab21
" \n\t" // ab33 ) ab30 ) ab31 ) ab32 )
" \n\t" //
" \n\t" // xmm12: xmm13: xmm14: xmm15:
" \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
" \n\t" // ab51 ab52 ab53 ab50
" \n\t" // ab62 ab63 ab60 ab61
" \n\t" // ab73 ) ab70 ) ab71 ) ab72 )
"movaps %%xmm9, %%xmm4 \n\t"
"shufps $0xd8, %%xmm8, %%xmm9 \n\t"
"shufps $0xd8, %%xmm11, %%xmm8 \n\t"
"shufps $0xd8, %%xmm10, %%xmm11\n\t"
"shufps $0xd8, %%xmm4, %%xmm10\n\t"
" \n\t"
"movaps %%xmm8, %%xmm4 \n\t"
"shufps $0xd8, %%xmm10, %%xmm8 \n\t"
"shufps $0xd8, %%xmm4, %%xmm10 \n\t"
"movaps %%xmm9, %%xmm5 \n\t"
"shufps $0xd8, %%xmm11, %%xmm9 \n\t"
"shufps $0xd8, %%xmm5, %%xmm11 \n\t"
" \n\t"
"movaps %%xmm13, %%xmm4 \n\t"
"shufps $0xd8, %%xmm12, %%xmm13\n\t"
"shufps $0xd8, %%xmm15, %%xmm12\n\t"
"shufps $0xd8, %%xmm14, %%xmm15\n\t"
"shufps $0xd8, %%xmm4, %%xmm14\n\t"
" \n\t"
"movaps %%xmm12, %%xmm4 \n\t"
"shufps $0xd8, %%xmm14, %%xmm12\n\t"
"shufps $0xd8, %%xmm4, %%xmm14 \n\t"
"movaps %%xmm13, %%xmm5 \n\t"
"shufps $0xd8, %%xmm15, %%xmm13\n\t"
"shufps $0xd8, %%xmm5, %%xmm15 \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
" \n\t" // ab10 ab11 ab12 ab13
" \n\t" // ab20 ab21 ab22 ab23
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
" \n\t" //
" \n\t" // xmm12: xmm13: xmm14: xmm15:
" \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
" \n\t" // ab50 ab51 ab52 ab53
" \n\t" // ab60 ab61 ab62 ab63
" \n\t" // ab70 ) ab71 ) ab72 ) ab73 )
" \n\t"
" \n\t"
" \n\t"
" \n\t" // determine if
" \n\t" // c % 16 == 0, AND
" \n\t" // rs_c == 1
" \n\t" // ie: aligned and column-stored
" \n\t"
"cmpq $1, %%r8 \n\t" // set ZF if rs_c == 1.
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
"testq $15, %%rcx \n\t" // set ZF if c & 15 is zero (16-byte aligned).
"setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 );
" \n\t" // and(bl,bh) will reveal result
" \n\t"
" \n\t" // now avoid loading C if beta == 0
" \n\t"
"xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero.
"ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0. NOTE(review): ucomisd compares the low 64 bits of packed floats as a double; correct for zero/finite beta, but ucomiss would be the precise single-precision compare — confirm intent.
"je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
" \n\t"
" \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"jne .SCOLSTORED \n\t" // jump to column storage case
" \n\t"
" \n\t"
" \n\t"
".SGENSTORED: \n\t"
" \n\t"
"movlps (%%rcx ), %%xmm0 \n\t" // load c00 ~ c30
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm8, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"movlps (%%rdx ), %%xmm0 \n\t" // load c40 ~ c70
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm12, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"movlps (%%rcx ), %%xmm0 \n\t" // load c01 ~ c31
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm9, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"movlps (%%rdx ), %%xmm0 \n\t" // load c41 ~ c71
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm13, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"movlps (%%rcx ), %%xmm0 \n\t" // load c02 ~ c32
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm10, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t"
" \n\t"
" \n\t"
"movlps (%%rdx ), %%xmm0 \n\t" // load c42 ~ c72
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm14, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"movlps (%%rcx ), %%xmm0 \n\t" // load c03 ~ c33
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm11, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
"movlps (%%rdx ), %%xmm0 \n\t" // load c43 ~ c73
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
" \n\t"
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm15, %%xmm0 \n\t" // add the gemm result,
" \n\t"
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
"jmp .SDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
".SCOLSTORED: \n\t"
" \n\t"
"movaps (%%rcx), %%xmm0 \n\t" // load c00 ~ c30,
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm8, %%xmm0 \n\t" // add the gemm result,
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t"
"movaps (%%rdx), %%xmm1 \n\t" // load c40 ~ c70,
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
"addps %%xmm12, %%xmm1 \n\t" // add the gemm result,
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t"
"movaps (%%rcx), %%xmm0 \n\t" // load c01 ~ c31,
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm9, %%xmm0 \n\t" // add the gemm result,
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t"
"movaps (%%rdx), %%xmm1 \n\t" // load c41 ~ c71,
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
"addps %%xmm13, %%xmm1 \n\t" // add the gemm result,
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t"
"movaps (%%rcx), %%xmm0 \n\t" // load c02 ~ c32,
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm10, %%xmm0 \n\t" // add the gemm result,
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t"
"movaps (%%rdx), %%xmm1 \n\t" // load c42 ~ c72,
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
"addps %%xmm14, %%xmm1 \n\t" // add the gemm result,
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t"
"movaps (%%rcx), %%xmm0 \n\t" // load c03 ~ c33,
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
"addps %%xmm11, %%xmm0 \n\t" // add the gemm result,
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
" \n\t"
" \n\t"
"movaps (%%rdx), %%xmm1 \n\t" // load c43 ~ c73,
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
"addps %%xmm15, %%xmm1 \n\t" // add the gemm result,
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
" \n\t"
"jmp .SDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
".SBETAZERO: \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"jne .SCOLSTORBZ \n\t" // jump to column storage case
" \n\t"
" \n\t"
" \n\t"
".SGENSTORBZ: \n\t"
" \n\t"
"jmp .SGENSTORED \n\t" // use gen-stored beta != 0 case for now
//"jmp .SDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
".SCOLSTORBZ: \n\t"
" \n\t"
" \n\t" // skip loading c00 ~ c30,
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
"movaps %%xmm8, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c40 ~ c70,
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
"movaps %%xmm12, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c01 ~ c31,
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
"movaps %%xmm9, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c41 ~ c71,
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
"movaps %%xmm13, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c02 ~ c32,
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
"movaps %%xmm10, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t"
" \n\t" // skip loading c42 ~ c72,
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
"movaps %%xmm14, (%%rdx) \n\t" // and store back to memory.
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t" // skip loading c03 ~ c33,
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
"movaps %%xmm11, (%%rcx) \n\t" // and store back to memory.
" \n\t"
" \n\t" // skip loading c43 ~ c73,
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
"movaps %%xmm15, (%%rdx) \n\t" // and store back to memory.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
".SDONE: \n\t"
" \n\t"
: // output operands (none)
: // input operands
"m" (k_iter),
"m" (k_left),
"m" (a),
"m" (b),
"m" (alpha),
"m" (beta),
"m" (c),
"m" (rs_c),
"m" (cs_c),
"m" (b_next)
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
}
void bli_dgemm_opt_d4x4(
@@ -109,11 +777,11 @@ void bli_dgemm_opt_d4x4(
" \n\t"
"movq %0, %%rsi \n\t" // i = k_iter;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that
"je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that
" \n\t" // contains the k_left loop.
" \n\t"
" \n\t"
".LOOPKITER: \n\t" // MAIN LOOP
".DLOOPKITER: \n\t" // MAIN LOOP
" \n\t"
"prefetcht0 (4*35+1) * 8(%%rax) \n\t"
" \n\t"
@@ -252,19 +920,19 @@ void bli_dgemm_opt_d4x4(
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .LOOPKITER \n\t" // iterate again if i != 0.
"jne .DLOOPKITER \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".CONSIDERKLEFT: \n\t"
".DCONSIDKLEFT: \n\t"
" \n\t"
"movq %1, %%rsi \n\t" // i = k_left;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .POSTACCUM \n\t" // if i == 0, we're done; jump to end.
"je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter k_left loop.
" \n\t"
" \n\t"
".LOOPKLEFT: \n\t" // EDGE LOOP
".DLOOPKLEFT: \n\t" // EDGE LOOP
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t"
@@ -302,11 +970,11 @@ void bli_dgemm_opt_d4x4(
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .LOOPKLEFT \n\t" // iterate again if i != 0.
"jne .DLOOPKLEFT \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".POSTACCUM: \n\t"
".DPOSTACCUM: \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
@@ -322,7 +990,8 @@ void bli_dgemm_opt_d4x4(
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c
"movq %%rsi, %%r8 \n\t" // make a copy of rs_c
"leaq (,%%rsi,8), %%rsi \n\t" // rs_c *= sizeof(double)
" \n\t"
"leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double)
" \n\t"
"leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c;
" \n\t"
@@ -373,16 +1042,16 @@ void bli_dgemm_opt_d4x4(
" \n\t"
"xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero.
"ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0.
"je .BETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
"je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
" \n\t"
" \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"jne .COLSTORED \n\t" // jump to column storage case
"jne .DCOLSTORED \n\t" // jump to column storage case
" \n\t"
" \n\t"
" \n\t"
".GENSTORED: \n\t"
".DGENSTORED: \n\t"
" \n\t"
"movlpd (%%rcx), %%xmm0 \n\t" // load c00 and c10,
"movhpd (%%rcx,%%rsi), %%xmm0 \n\t"
@@ -461,11 +1130,11 @@ void bli_dgemm_opt_d4x4(
"movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory.
"movhpd %%xmm1, (%%rdx,%%rsi) \n\t"
" \n\t"
"jmp .DONE \n\t" // jump to end.
"jmp .DDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
".COLSTORED: \n\t"
".DCOLSTORED: \n\t"
" \n\t"
"movaps (%%rcx), %%xmm0 \n\t" // load c00 and c10,
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
@@ -528,19 +1197,19 @@ void bli_dgemm_opt_d4x4(
"addpd %%xmm15, %%xmm1 \n\t" // add the gemm result,
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
" \n\t"
"jmp .DONE \n\t" // jump to end.
"jmp .DDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
".BETAZERO: \n\t"
".DBETAZERO: \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"jne .COLSTORBZ \n\t" // jump to column storage case
"jne .DCOLSTORBZ \n\t" // jump to column storage case
" \n\t"
" \n\t"
" \n\t"
".GENSTORBZ: \n\t"
".DGENSTORBZ: \n\t"
" \n\t" // skip loading c00 and c10,
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
"movlpd %%xmm8, (%%rcx) \n\t" // and store back to memory.
@@ -587,11 +1256,11 @@ void bli_dgemm_opt_d4x4(
"movlpd %%xmm15, (%%rdx) \n\t" // and store back to memory.
"movhpd %%xmm15, (%%rdx,%%rsi) \n\t"
" \n\t"
"jmp .DONE \n\t" // jump to end.
"jmp .DDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
".COLSTORBZ: \n\t"
".DCOLSTORBZ: \n\t"
" \n\t"
" \n\t" // skip loading c00 and c10,
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
@@ -638,7 +1307,7 @@ void bli_dgemm_opt_d4x4(
" \n\t"
" \n\t"
" \n\t"
".DONE: \n\t"
".DDONE: \n\t"
" \n\t"
: // output operands (none)
@@ -661,7 +1330,6 @@ void bli_dgemm_opt_d4x4(
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
}
void bli_cgemm_opt_d4x4(