mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Added single-precision real gemm micro-kernel for x86_64.
Details:
- Added a single-precision real gemm micro-kernel in kernels/x86_64/3/bli_gemm_opt_d4x4.c.
- Adjusted the single-precision real register blocksizes in config/clarksville/bli_kernel.h to be 8x4.
- Added a missing comment to bli_packm_blk_var2.c that was present in bli_packm_blk_var3.c.
This commit is contained in:
@@ -54,7 +54,7 @@
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_MC_S 768
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
|
||||
@@ -82,8 +82,8 @@
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/2)
|
||||
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/2)
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/2)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/2)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
@@ -100,7 +100,7 @@
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
|
||||
@@ -260,6 +260,9 @@ void PASTEMAC(ch,varname )( \
|
||||
{ \
|
||||
diagoffc_i_abs = bli_abs( diagoffc_i ); \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel, but we can probably still support those cases if
|
||||
they occur. */ \
|
||||
if ( ( bli_is_col_stored( rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
|
||||
@@ -45,7 +45,675 @@ void bli_sgemm_opt_d4x4(
|
||||
float* restrict b_next
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
dim_t k_iter;
|
||||
dim_t k_left;
|
||||
|
||||
k_iter = k / 4;
|
||||
k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %2, %%rax \n\t" // load address of a.
|
||||
"movq %3, %%rbx \n\t" // load address of b.
|
||||
"movq %9, %%r9 \n\t" // load address of b_next.
|
||||
" \n\t"
|
||||
"subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte
|
||||
"subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
|
||||
" \n\t"
|
||||
"movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements
|
||||
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b.
|
||||
"movaps -8 * 16(%%rbx), %%xmm2 \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rcx \n\t" // load address of c
|
||||
"movq %8, %%rdi \n\t" // load cs_c
|
||||
"leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float)
|
||||
"leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c;
|
||||
" \n\t"
|
||||
"prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next
|
||||
" \n\t"
|
||||
"xorpd %%xmm3, %%xmm3 \n\t"
|
||||
"xorpd %%xmm4, %%xmm4 \n\t"
|
||||
"xorpd %%xmm5, %%xmm5 \n\t"
|
||||
"xorpd %%xmm6, %%xmm6 \n\t"
|
||||
" \n\t"
|
||||
"prefetcht0 6 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c
|
||||
"xorpd %%xmm8, %%xmm8 \n\t"
|
||||
"movaps %%xmm8, %%xmm9 \n\t"
|
||||
"prefetcht0 6 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
|
||||
"movaps %%xmm8, %%xmm10 \n\t"
|
||||
"movaps %%xmm8, %%xmm11 \n\t"
|
||||
"prefetcht0 6 * 4(%%r10) \n\t" // prefetch c + 2*cs_c
|
||||
"movaps %%xmm8, %%xmm12 \n\t"
|
||||
"movaps %%xmm8, %%xmm13 \n\t"
|
||||
"prefetcht0 6 * 4(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
|
||||
"movaps %%xmm8, %%xmm14 \n\t"
|
||||
"movaps %%xmm8, %%xmm15 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %0, %%rsi \n\t" // i = k_iter;
|
||||
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
|
||||
"je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that
|
||||
" \n\t" // contains the k_left loop.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SLOOPKITER: \n\t" // MAIN LOOP
|
||||
" \n\t"
|
||||
"prefetcht0 (4*35+1) * 8(%%rax) \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm6, %%xmm10 \n\t" // iteration 0
|
||||
"addps %%xmm3, %%xmm14 \n\t"
|
||||
"movaps %%xmm2, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
|
||||
"mulps %%xmm0, %%xmm2 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm4, %%xmm11 \n\t"
|
||||
"addps %%xmm5, %%xmm15 \n\t"
|
||||
"movaps %%xmm7, %%xmm5 \n\t"
|
||||
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
|
||||
"mulps %%xmm0, %%xmm7 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm2, %%xmm8 \n\t"
|
||||
"movaps -7 * 16(%%rbx), %%xmm2 \n\t"
|
||||
"addps %%xmm3, %%xmm12 \n\t"
|
||||
"movaps %%xmm6, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
|
||||
"mulps %%xmm0, %%xmm6 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm7, %%xmm9 \n\t"
|
||||
"addps %%xmm5, %%xmm13 \n\t"
|
||||
"movaps %%xmm4, %%xmm5 \n\t"
|
||||
"mulps %%xmm0, %%xmm4 \n\t"
|
||||
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm6, %%xmm10 \n\t" // iteration 1
|
||||
"addps %%xmm3, %%xmm14 \n\t"
|
||||
"movaps %%xmm2, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
|
||||
"mulps %%xmm0, %%xmm2 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm4, %%xmm11 \n\t"
|
||||
"addps %%xmm5, %%xmm15 \n\t"
|
||||
"movaps %%xmm7, %%xmm5 \n\t"
|
||||
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
|
||||
"mulps %%xmm0, %%xmm7 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm2, %%xmm8 \n\t"
|
||||
"movaps -6 * 16(%%rbx), %%xmm2 \n\t"
|
||||
"addps %%xmm3, %%xmm12 \n\t"
|
||||
"movaps %%xmm6, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
|
||||
"mulps %%xmm0, %%xmm6 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm7, %%xmm9 \n\t"
|
||||
"addps %%xmm5, %%xmm13 \n\t"
|
||||
"movaps %%xmm4, %%xmm5 \n\t"
|
||||
"mulps %%xmm0, %%xmm4 \n\t"
|
||||
"movaps -4 * 16(%%rax), %%xmm0 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
"movaps -3 * 16(%%rax), %%xmm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm6, %%xmm10 \n\t" // iteration 2
|
||||
"addps %%xmm3, %%xmm14 \n\t"
|
||||
"movaps %%xmm2, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
|
||||
"mulps %%xmm0, %%xmm2 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm4, %%xmm11 \n\t"
|
||||
"addps %%xmm5, %%xmm15 \n\t"
|
||||
"movaps %%xmm7, %%xmm5 \n\t"
|
||||
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
|
||||
"mulps %%xmm0, %%xmm7 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm2, %%xmm8 \n\t"
|
||||
"movaps -5 * 16(%%rbx), %%xmm2 \n\t"
|
||||
"addps %%xmm3, %%xmm12 \n\t"
|
||||
"movaps %%xmm6, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
|
||||
"mulps %%xmm0, %%xmm6 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm7, %%xmm9 \n\t"
|
||||
"addps %%xmm5, %%xmm13 \n\t"
|
||||
"movaps %%xmm4, %%xmm5 \n\t"
|
||||
"mulps %%xmm0, %%xmm4 \n\t"
|
||||
"movaps -2 * 16(%%rax), %%xmm0 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
"movaps -1 * 16(%%rax), %%xmm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm6, %%xmm10 \n\t" // iteration 3
|
||||
"addps %%xmm3, %%xmm14 \n\t"
|
||||
"movaps %%xmm2, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
|
||||
"mulps %%xmm0, %%xmm2 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"subq $-4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr)
|
||||
" \n\t"
|
||||
"addps %%xmm4, %%xmm11 \n\t"
|
||||
"addps %%xmm5, %%xmm15 \n\t"
|
||||
"movaps %%xmm7, %%xmm5 \n\t"
|
||||
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
|
||||
"mulps %%xmm0, %%xmm7 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
" \n\t"
|
||||
"subq $-4 * 4 * 4, %%r9 \n\t" // b_next += 4*4 (unroll x nr)
|
||||
" \n\t"
|
||||
"addps %%xmm2, %%xmm8 \n\t"
|
||||
"movaps -4 * 16(%%rbx), %%xmm2 \n\t"
|
||||
"addps %%xmm3, %%xmm12 \n\t"
|
||||
"movaps %%xmm6, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
|
||||
"mulps %%xmm0, %%xmm6 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"subq $-4 * 4 * 4, %%rbx \n\t" // b += 4*4 (unroll x nr)
|
||||
" \n\t"
|
||||
"addps %%xmm7, %%xmm9 \n\t"
|
||||
"addps %%xmm5, %%xmm13 \n\t"
|
||||
"movaps %%xmm4, %%xmm5 \n\t"
|
||||
"mulps %%xmm0, %%xmm4 \n\t"
|
||||
"movaps -8 * 16(%%rax), %%xmm0 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
"movaps -7 * 16(%%rax), %%xmm1 \n\t"
|
||||
" \n\t"
|
||||
"prefetcht2 0 * 4(%%r9) \n\t" // prefetch b_next[0]
|
||||
"prefetcht2 16 * 4(%%r9) \n\t" // prefetch b_next[16]
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
"jne .SLOOPKITER \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SCONSIDKLEFT: \n\t"
|
||||
" \n\t"
|
||||
"movq %1, %%rsi \n\t" // i = k_left;
|
||||
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
|
||||
"je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
|
||||
" \n\t" // else, we prepare to enter k_left loop.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SLOOPKLEFT: \n\t" // EDGE LOOP
|
||||
" \n\t"
|
||||
"addps %%xmm6, %%xmm10 \n\t" // iteration 0
|
||||
"addps %%xmm3, %%xmm14 \n\t"
|
||||
"movaps %%xmm2, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm7 \n\t"
|
||||
"mulps %%xmm0, %%xmm2 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm4, %%xmm11 \n\t"
|
||||
"addps %%xmm5, %%xmm15 \n\t"
|
||||
"movaps %%xmm7, %%xmm5 \n\t"
|
||||
"pshufd $0x39, %%xmm7, %%xmm6 \n\t"
|
||||
"mulps %%xmm0, %%xmm7 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm2, %%xmm8 \n\t"
|
||||
"movaps -7 * 16(%%rbx), %%xmm2 \n\t"
|
||||
"addps %%xmm3, %%xmm12 \n\t"
|
||||
"movaps %%xmm6, %%xmm3 \n\t"
|
||||
"pshufd $0x39, %%xmm6, %%xmm4 \n\t"
|
||||
"mulps %%xmm0, %%xmm6 \n\t"
|
||||
"mulps %%xmm1, %%xmm3 \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm7, %%xmm9 \n\t"
|
||||
"addps %%xmm5, %%xmm13 \n\t"
|
||||
"movaps %%xmm4, %%xmm5 \n\t"
|
||||
"mulps %%xmm0, %%xmm4 \n\t"
|
||||
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
|
||||
"mulps %%xmm1, %%xmm5 \n\t"
|
||||
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
|
||||
" \n\t"
|
||||
"subq $-1 * 8 * 4, %%rax \n\t" // a += 8 (1 x mr)
|
||||
"subq $-1 * 4 * 4, %%rbx \n\t" // b += 4 (1 x nr)
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
"jne .SLOOPKLEFT \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SPOSTACCUM: \n\t"
|
||||
" \n\t"
|
||||
"addps %%xmm6, %%xmm10 \n\t"
|
||||
"addps %%xmm3, %%xmm14 \n\t"
|
||||
"addps %%xmm4, %%xmm11 \n\t"
|
||||
"addps %%xmm5, %%xmm15 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %4, %%rax \n\t" // load address of alpha
|
||||
"movq %5, %%rbx \n\t" // load address of beta
|
||||
"movss (%%rax), %%xmm6 \n\t" // load alpha to bottom 4 bytes of xmm6
|
||||
"movss (%%rbx), %%xmm7 \n\t" // load beta to bottom 4 bytes of xmm7
|
||||
"pshufd $0x00, %%xmm6, %%xmm6 \n\t" // populate xmm6 with four alphas
|
||||
"pshufd $0x00, %%xmm7, %%xmm7 \n\t" // populate xmm7 with four betas
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %7, %%rsi \n\t" // load rs_c
|
||||
"movq %%rsi, %%r8 \n\t" // make a copy of rs_c
|
||||
" \n\t"
|
||||
"leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float)
|
||||
"leaq (%%rsi,%%rsi,2), %%r11 \n\t" // r11 = 3*(rs_c * sizeof(float))
|
||||
" \n\t"
|
||||
"leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
|
||||
" \n\t"
|
||||
" \n\t" // xmm8: xmm9: xmm10: xmm11:
|
||||
" \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
|
||||
" \n\t" // ab11 ab12 ab13 ab10
|
||||
" \n\t" // ab22 ab23 ab20 ab21
|
||||
" \n\t" // ab33 ) ab30 ) ab31 ) ab32 )
|
||||
" \n\t" //
|
||||
" \n\t" // xmm12: xmm13: xmm14: xmm15:
|
||||
" \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
|
||||
" \n\t" // ab51 ab52 ab53 ab50
|
||||
" \n\t" // ab62 ab63 ab60 ab61
|
||||
" \n\t" // ab73 ) ab70 ) ab71 ) ab72 )
|
||||
"movaps %%xmm9, %%xmm4 \n\t"
|
||||
"shufps $0xd8, %%xmm8, %%xmm9 \n\t"
|
||||
"shufps $0xd8, %%xmm11, %%xmm8 \n\t"
|
||||
"shufps $0xd8, %%xmm10, %%xmm11\n\t"
|
||||
"shufps $0xd8, %%xmm4, %%xmm10\n\t"
|
||||
" \n\t"
|
||||
"movaps %%xmm8, %%xmm4 \n\t"
|
||||
"shufps $0xd8, %%xmm10, %%xmm8 \n\t"
|
||||
"shufps $0xd8, %%xmm4, %%xmm10 \n\t"
|
||||
"movaps %%xmm9, %%xmm5 \n\t"
|
||||
"shufps $0xd8, %%xmm11, %%xmm9 \n\t"
|
||||
"shufps $0xd8, %%xmm5, %%xmm11 \n\t"
|
||||
" \n\t"
|
||||
"movaps %%xmm13, %%xmm4 \n\t"
|
||||
"shufps $0xd8, %%xmm12, %%xmm13\n\t"
|
||||
"shufps $0xd8, %%xmm15, %%xmm12\n\t"
|
||||
"shufps $0xd8, %%xmm14, %%xmm15\n\t"
|
||||
"shufps $0xd8, %%xmm4, %%xmm14\n\t"
|
||||
" \n\t"
|
||||
"movaps %%xmm12, %%xmm4 \n\t"
|
||||
"shufps $0xd8, %%xmm14, %%xmm12\n\t"
|
||||
"shufps $0xd8, %%xmm4, %%xmm14 \n\t"
|
||||
"movaps %%xmm13, %%xmm5 \n\t"
|
||||
"shufps $0xd8, %%xmm15, %%xmm13\n\t"
|
||||
"shufps $0xd8, %%xmm5, %%xmm15 \n\t"
|
||||
" \n\t" // xmm8: xmm9: xmm10: xmm11:
|
||||
" \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
|
||||
" \n\t" // ab10 ab11 ab12 ab13
|
||||
" \n\t" // ab20 ab21 ab22 ab23
|
||||
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
|
||||
" \n\t" //
|
||||
" \n\t" // xmm12: xmm13: xmm14: xmm15:
|
||||
" \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
|
||||
" \n\t" // ab50 ab51 ab52 ab53
|
||||
" \n\t" // ab60 ab61 ab62 ab63
|
||||
" \n\t" // ab70 ) ab71 ) ab72 ) ab73 )
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // determine if
|
||||
" \n\t" // c % 16 == 0, AND
|
||||
" \n\t" // rs_c == 1
|
||||
" \n\t" // ie: aligned and column-stored
|
||||
" \n\t"
|
||||
"cmpq $1, %%r8 \n\t" // set ZF if rs_c == 1.
|
||||
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
|
||||
"testq $15, %%rcx \n\t" // set ZF if c & 15 is zero (c is 16-byte aligned).
|
||||
"setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 );
|
||||
" \n\t" // and(bl,bh) will reveal result
|
||||
" \n\t"
|
||||
" \n\t" // now avoid loading C if beta == 0
|
||||
" \n\t"
|
||||
"xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero.
|
||||
"ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0.
|
||||
"je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // clear ZF (result nonzero) if bl & bh == 1.
|
||||
"jne .SCOLSTORED \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SGENSTORED: \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rcx ), %%xmm0 \n\t" // load c00 ~ c30
|
||||
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm8, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rdx ), %%xmm0 \n\t" // load c40 ~ c70
|
||||
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm12, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rcx ), %%xmm0 \n\t" // load c01 ~ c31
|
||||
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm9, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rdx ), %%xmm0 \n\t" // load c41 ~ c71
|
||||
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm13, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rcx ), %%xmm0 \n\t" // load c02 ~ c32
|
||||
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm10, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rdx ), %%xmm0 \n\t" // load c42 ~ c72
|
||||
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm14, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rcx ), %%xmm0 \n\t" // load c03 ~ c33
|
||||
"movhps (%%rcx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rcx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rcx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm11, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rcx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rcx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rcx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rcx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movlps (%%rdx ), %%xmm0 \n\t" // load c43 ~ c73
|
||||
"movhps (%%rdx,%%rsi,1), %%xmm0 \n\t"
|
||||
"movlps (%%rdx,%%rsi,2), %%xmm1 \n\t"
|
||||
"movhps (%%rdx,%%r11 ), %%xmm1 \n\t"
|
||||
"shufps $0x88, %%xmm1, %%xmm0 \n\t"
|
||||
" \n\t"
|
||||
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm15, %%xmm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"movss %%xmm0, (%%rdx ) \n\t" // and store back to memory.
|
||||
"pshufd $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"movss %%xmm1, (%%rdx,%%rsi,1) \n\t"
|
||||
"pshufd $0x39, %%xmm1, %%xmm2 \n\t"
|
||||
"movss %%xmm2, (%%rdx,%%rsi,2) \n\t"
|
||||
"pshufd $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"movss %%xmm3, (%%rdx,%%r11 ) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"jmp .SDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SCOLSTORED: \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rcx), %%xmm0 \n\t" // load c00 ~ c30,
|
||||
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm8, %%xmm0 \n\t" // add the gemm result,
|
||||
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rdx), %%xmm1 \n\t" // load c40 ~ c70,
|
||||
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
|
||||
"addps %%xmm12, %%xmm1 \n\t" // add the gemm result,
|
||||
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rcx), %%xmm0 \n\t" // load c01 ~ c31,
|
||||
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm9, %%xmm0 \n\t" // add the gemm result,
|
||||
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rdx), %%xmm1 \n\t" // load c41 ~ c71,
|
||||
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
|
||||
"addps %%xmm13, %%xmm1 \n\t" // add the gemm result,
|
||||
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rcx), %%xmm0 \n\t" // load c02 ~ c32,
|
||||
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm10, %%xmm0 \n\t" // add the gemm result,
|
||||
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rdx), %%xmm1 \n\t" // load c42 ~ c72,
|
||||
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
|
||||
"addps %%xmm14, %%xmm1 \n\t" // add the gemm result,
|
||||
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rcx), %%xmm0 \n\t" // load c03 ~ c33,
|
||||
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm0 \n\t" // scale by beta,
|
||||
"addps %%xmm11, %%xmm0 \n\t" // add the gemm result,
|
||||
"movaps %%xmm0, (%%rcx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rdx), %%xmm1 \n\t" // load c43 ~ c73,
|
||||
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
|
||||
"mulps %%xmm7, %%xmm1 \n\t" // scale by beta,
|
||||
"addps %%xmm15, %%xmm1 \n\t" // add the gemm result,
|
||||
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
"jmp .SDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SBETAZERO: \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // clear ZF (result nonzero) if bl & bh == 1.
|
||||
"jne .SCOLSTORBZ \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SGENSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
"jmp .SGENSTORED \n\t" // use gen-stored beta != 0 case for now
|
||||
//"jmp .SDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SCOLSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c00 ~ c30,
|
||||
"mulps %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
"movaps %%xmm8, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c40 ~ c70,
|
||||
"mulps %%xmm6, %%xmm12 \n\t" // scale by alpha,
|
||||
"movaps %%xmm12, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c01 ~ c31,
|
||||
"mulps %%xmm6, %%xmm9 \n\t" // scale by alpha,
|
||||
"movaps %%xmm9, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c41 ~ c71,
|
||||
"mulps %%xmm6, %%xmm13 \n\t" // scale by alpha,
|
||||
"movaps %%xmm13, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c02 ~ c32,
|
||||
"mulps %%xmm6, %%xmm10 \n\t" // scale by alpha,
|
||||
"movaps %%xmm10, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
" \n\t" // skip loading c42 ~ c72,
|
||||
"mulps %%xmm6, %%xmm14 \n\t" // scale by alpha,
|
||||
"movaps %%xmm14, (%%rdx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c03 ~ c33,
|
||||
"mulps %%xmm6, %%xmm11 \n\t" // scale by alpha,
|
||||
"movaps %%xmm11, (%%rcx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c43 ~ c73,
|
||||
"mulps %%xmm6, %%xmm15 \n\t" // scale by alpha,
|
||||
"movaps %%xmm15, (%%rdx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SDONE: \n\t"
|
||||
" \n\t"
|
||||
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter),
|
||||
"m" (k_left),
|
||||
"m" (a),
|
||||
"m" (b),
|
||||
"m" (alpha),
|
||||
"m" (beta),
|
||||
"m" (c),
|
||||
"m" (rs_c),
|
||||
"m" (cs_c),
|
||||
"m" (b_next)
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
"xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
}
|
||||
|
||||
void bli_dgemm_opt_d4x4(
|
||||
@@ -109,11 +777,11 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t"
|
||||
"movq %0, %%rsi \n\t" // i = k_iter;
|
||||
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
|
||||
"je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that
|
||||
"je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that
|
||||
" \n\t" // contains the k_left loop.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".LOOPKITER: \n\t" // MAIN LOOP
|
||||
".DLOOPKITER: \n\t" // MAIN LOOP
|
||||
" \n\t"
|
||||
"prefetcht0 (4*35+1) * 8(%%rax) \n\t"
|
||||
" \n\t"
|
||||
@@ -252,19 +920,19 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
"jne .LOOPKITER \n\t" // iterate again if i != 0.
|
||||
"jne .DLOOPKITER \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".CONSIDERKLEFT: \n\t"
|
||||
".DCONSIDKLEFT: \n\t"
|
||||
" \n\t"
|
||||
"movq %1, %%rsi \n\t" // i = k_left;
|
||||
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
|
||||
"je .POSTACCUM \n\t" // if i == 0, we're done; jump to end.
|
||||
"je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
|
||||
" \n\t" // else, we prepare to enter k_left loop.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".LOOPKLEFT: \n\t" // EDGE LOOP
|
||||
".DLOOPKLEFT: \n\t" // EDGE LOOP
|
||||
" \n\t"
|
||||
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0
|
||||
"movaps -7 * 16(%%rbx), %%xmm3 \n\t"
|
||||
@@ -302,11 +970,11 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
"jne .LOOPKLEFT \n\t" // iterate again if i != 0.
|
||||
"jne .DLOOPKLEFT \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".POSTACCUM: \n\t"
|
||||
".DPOSTACCUM: \n\t"
|
||||
" \n\t"
|
||||
"addpd %%xmm3, %%xmm11 \n\t"
|
||||
"addpd %%xmm4, %%xmm15 \n\t"
|
||||
@@ -322,7 +990,8 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t"
|
||||
"movq %7, %%rsi \n\t" // load rs_c
|
||||
"movq %%rsi, %%r8 \n\t" // make a copy of rs_c
|
||||
"leaq (,%%rsi,8), %%rsi \n\t" // rs_c *= sizeof(double)
|
||||
" \n\t"
|
||||
"leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double)
|
||||
" \n\t"
|
||||
"leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c;
|
||||
" \n\t"
|
||||
@@ -373,16 +1042,16 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t"
|
||||
"xorpd %%xmm0, %%xmm0 \n\t" // set xmm0 to zero.
|
||||
"ucomisd %%xmm0, %%xmm7 \n\t" // check if beta == 0.
|
||||
"je .BETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
|
||||
"je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // clear ZF (result nonzero) if bl & bh == 1.
|
||||
"jne .COLSTORED \n\t" // jump to column storage case
|
||||
"jne .DCOLSTORED \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".GENSTORED: \n\t"
|
||||
".DGENSTORED: \n\t"
|
||||
" \n\t"
|
||||
"movlpd (%%rcx), %%xmm0 \n\t" // load c00 and c10,
|
||||
"movhpd (%%rcx,%%rsi), %%xmm0 \n\t"
|
||||
@@ -461,11 +1130,11 @@ void bli_dgemm_opt_d4x4(
|
||||
"movlpd %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm1, (%%rdx,%%rsi) \n\t"
|
||||
" \n\t"
|
||||
"jmp .DONE \n\t" // jump to end.
|
||||
"jmp .DDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".COLSTORED: \n\t"
|
||||
".DCOLSTORED: \n\t"
|
||||
" \n\t"
|
||||
"movaps (%%rcx), %%xmm0 \n\t" // load c00 and c10,
|
||||
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
@@ -528,19 +1197,19 @@ void bli_dgemm_opt_d4x4(
|
||||
"addpd %%xmm15, %%xmm1 \n\t" // add the gemm result,
|
||||
"movaps %%xmm1, (%%rdx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
"jmp .DONE \n\t" // jump to end.
|
||||
"jmp .DDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".BETAZERO: \n\t"
|
||||
".DBETAZERO: \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // clear ZF (result nonzero) if bl & bh == 1.
|
||||
"jne .COLSTORBZ \n\t" // jump to column storage case
|
||||
"jne .DCOLSTORBZ \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".GENSTORBZ: \n\t"
|
||||
".DGENSTORBZ: \n\t"
|
||||
" \n\t" // skip loading c00 and c10,
|
||||
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
"movlpd %%xmm8, (%%rcx) \n\t" // and store back to memory.
|
||||
@@ -587,11 +1256,11 @@ void bli_dgemm_opt_d4x4(
|
||||
"movlpd %%xmm15, (%%rdx) \n\t" // and store back to memory.
|
||||
"movhpd %%xmm15, (%%rdx,%%rsi) \n\t"
|
||||
" \n\t"
|
||||
"jmp .DONE \n\t" // jump to end.
|
||||
"jmp .DDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".COLSTORBZ: \n\t"
|
||||
".DCOLSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // skip loading c00 and c10,
|
||||
"mulpd %%xmm6, %%xmm8 \n\t" // scale by alpha,
|
||||
@@ -638,7 +1307,7 @@ void bli_dgemm_opt_d4x4(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DONE: \n\t"
|
||||
".DDONE: \n\t"
|
||||
" \n\t"
|
||||
|
||||
: // output operands (none)
|
||||
@@ -661,7 +1330,6 @@ void bli_dgemm_opt_d4x4(
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
void bli_cgemm_opt_d4x4(
|
||||
|
||||
Reference in New Issue
Block a user